51b3a0fad43a31480347c494132d8bcfeb6ec4d4
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.rbmaradio import RBMARadioIE
45 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
46 from .extractor.stanfordoc import StanfordOpenClassroomIE
47 from .extractor.steam import SteamIE
48 from .extractor.ted import TEDIE
49 from .extractor.ustream import UstreamIE
50 from .extractor.vimeo import VimeoIE
51 from .extractor.worldstarhiphop import WorldStarHipHopIE
52 from .extractor.xnxx import XNXXIE
53 from .extractor.xvideos import XVideosIE
54 from .extractor.yahoo import YahooIE, YahooSearchIE
55 from .extractor.youku import YoukuIE
56 from .extractor.youporn import YouPornIE
57 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
58 from .extractor.zdf import ZDFIE
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The video id and title come from the URL itself; the media URL and
        the upload date are searched for in the page obtained through
        ``self._download_webpage``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id, video_title = url_match.group('videoid', 'title')
        page = self._download_webpage(url, video_id)

        # Media location as written in the page, then percent-decoded
        quoted_video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url')
        video_url = compat_urllib_parse.unquote(quoted_video_url)

        # The upload date is optional (fatal=False): normalise it only if found
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
116
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Two pages are requested: the video page (for the title and the link
        to the embed page) and the embed page (for the media URL). The
        returned id is the numeric one found in the embed link.

        Raises ExtractorError when ``url`` is not valid or when no embed
        link is present in the video page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        main_page = self._download_webpage(url, url_match.group('videoid'))

        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', main_page, u'title')
        title = title.strip()

        # Locate the embed page; its numeric id replaces the id from the URL
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)',
            main_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)

        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
157
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list with one info dict per track of the mix at ``url``.

        The mix description is read from the ``PAGE.mix`` JSON assignment in
        the page. Tracks are then requested one at a time (first the "play"
        URL, afterwards the "next" URL) until the response reports
        ``at_last_track``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)
        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information',
            flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random number is used as the session token in the request URLs
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']

        tracks = []
        request_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        for position in itertools.count(1):
            response_text = self._download_webpage(
                request_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), track_count),
                errnote=u'Failed to download song information')
            response = json.loads(response_text)
            track = response[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if response['set']['at_last_track']:
                break
            # The next request names the track that was just received
            request_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
198
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The media and thumbnail URLs are built directly from the video id;
        the page is only downloaded for the title and the (optional)
        uploader name.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
226
227
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The video id is the last non-empty path element of ``url``. All
        other fields are read from the metadata XML requested for that id.

        Raises ExtractorError when the metadata has no ``url_flv`` or no
        ``title`` element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag):
            """Return the text of child ``tag``, or None if it is absent."""
            element = metadata.find(tag)
            if element is None:
                return None
            return element.text

        # mandatory values
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        # file extension of the media URL, without the leading dot
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without a <format_id> element, fall back to the file extension.
        # (This used to reference an undefined name `ext` and crashed with
        # a NameError.)
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }]
281
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The title is taken from the video page; file name and duration come
        from the last child element of the per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        flash_base = u'http://video2.spiegel.de/flash/'
        xml_text = self._download_webpage(
            flash_base + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child of the root element is used
        variant = xml.etree.ElementTree.fromstring(xml_text)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
313
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme part used to be written `http?://`, which makes the final
    # "p" optional instead of allowing an "s", so https:// URLs were never
    # recognised. `https?://` accepts both http and https.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The media URL and title are mandatory; description and uploader are
        searched with ``fatal=False`` and may be missing.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Remove the site prefix from the og:title value
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
350
351
352
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Whatever form the input URL has, the page that is downloaded is the
        ``/post/<id>/`` page of the blog.

        Raises ExtractorError when no video source is found in that page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        # The pattern is written against \x22 sequences around the values
        source_pattern = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        source_match = re.search(source_pattern, page)
        if source_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title; it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            page, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': source_match.group('video_url'),
            'title': title,
            'thumbnail': thumbnail,
            'ext': source_match.group('ext'),
        }]
386
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track.

        The track page must link to a free download page; the mp3-320 entry
        of that page is used to build a "statdownload" request whose
        response contains the final URL.

        Raises ExtractorError when the page has no free download link.
        """
        url_title = re.match(self._VALID_URL, url).group('title')
        page = self._download_webpage(url, url_title)

        # Link to the free download page
        free_page_match = re.search(r'freeDownloadPage: "(.*?)"', page)
        if free_page_match is None:
            raise ExtractorError(u'No free songs found')
        free_page_url = free_page_match.group(1)

        track_id = re.search(
            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            page, re.MULTILINE|re.DOTALL).group('id')

        free_page = self._download_webpage(
            free_page_url, track_id, 'Downloading free downloads page')

        # The track dictionary is embedded in some javascript code
        items_json = re.search(
            r'items: (.*?),$', free_page, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # Using this url directly says the link has expired
        expired_url = track[u'downloads'][u'mp3-320'][u'url']
        url_parts = re.match(
            r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
            expired_url)

        # Build the url used to get the final track url.
        # Bandcamp builds this url in the script download_bunde_*.js
        stat_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            url_parts.group('server'), url_parts.group('fsig'),
            track_id, url_parts.group('ts'))
        stat_page = self._download_webpage(
            stat_url, track_id, 'Requesting download url')

        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', stat_page).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
432
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
460         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Both the media URL and the title are read from the ``.mrss``
        document requested for the video id, not from the page at ``url``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            mrss, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            mrss, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
487
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is always requested from the canonical
        ``http://www.howcast.com/videos/<id>`` address. Description and
        thumbnail are optional (``fatal=False``).
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
521
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is always requested from ``https://vine.co/v/<id>``.
        Thumbnail and uploader are optional (``fatal=False``).
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
555
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        Three documents are requested in turn: the photo page (secret and
        metadata), a first XML document (node id) and a second XML document
        (stream location).

        Raises ExtractorError when the second document has no STREAM entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id,
            video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the node id of the video
        node_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self',
            video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', node_xml, u'node_id')

        # Step 2: the playlist that names the stream
        playlist_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1',
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped before being appended
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }]
604
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The numeric video id is read from the page and then used to request
        the XML document that contains the media URL.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        page = self._download_webpage(url, url_match.group('url_title'))

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media URL is the <file type="high"> entry of the data document
        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data_xml, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
643
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The page is requested from ``http://xhamster.com/movies/<id>/.html``.
        A missing upload date only produces a warning; a missing uploader
        falls back to ``anonymous``.

        Raises ExtractorError when the media location is not in the page.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(
            'http://xhamster.com/movies/%s/.html' % video_id, video_id)

        media = re.search(
            r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',',
            page)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = media.group('server')
        if not server:
            # Empty server: 'file' is itself a percent-encoded URL
            media_url = compat_urllib_parse.unquote(media.group('file'))
        else:
            media_url = server+'/key='+media.group('file')
        extension = media_url.split('.')[-1]

        title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            page, u'title')

        # No description is extracted: it can't be seen anywhere in the UI

        date_match = re.search(
            r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'',
            page)
        if date_match:
            # Year, month and day joined without separators
            upload_date = ''.join(date_match.group(
                'upload_date_Y', 'upload_date_m', 'upload_date_d'))
        else:
            upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            page, u'uploader id', default=u'anonymous')

        thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': title,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
        }]
695
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The track page is requested with extra ``ax``/``ts`` query
        parameters; the ``Set-Cookie`` header of that response is sent back
        with the second request, which returns the final URL as JSON.

        Raises ExtractorError when ``url`` is not valid or when either JSON
        document cannot be decoded.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        page_request = compat_urllib_request.Request(url + "?" + query)
        page, urlh = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        tracks_json = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            page, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(tracks_json)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported by the site replaces the one in the URL
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        serve_request = compat_urllib_request.Request(
            serve_url, "", {'Content-Type': 'application/json'})
        serve_request.add_header('cookie', cookie)
        song_json = self._download_webpage(
            serve_request, track_id, u'Downloading metadata')
        try:
            song = json.loads(song_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id': track_id,
            'url': song[u"url"],
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
745
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for ``url``.

        The first response only contains a javascript redirect. The title
        is read from the page behind that redirect; the media and thumbnail
        URLs come from a form-encoded request to ``magare.do``.

        Raises ExtractorError when ``url`` is not valid or when the info
        request yields ``None``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Follow the window.location redirect by hand
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page,
            u'redirect location')
        page = self._download_webpage(
            urlh.geturl() + location, video_id, u'Downloading redirect page')

        # Keep only what precedes the first '/' of the <title> text
        full_title = self._html_search_regex(
            r'<title>(.*)</title>', page, u'title')
        title = full_title.split('/')[0].strip()

        form = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do", form)
        info_request.add_header(
            'Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(
            info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The response must hold exactly two '&'-separated key=value pairs:
        # first the media URL, then the thumbnail URL
        final_url, thumbnail_url = [
            pair.split('=')[1] for pair in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
781
782
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Classes in matching priority order; GenericIE comes last.
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
852
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes are module-level names of the form <ie_name>IE
    class_name = ie_name + 'IE'
    return globals()[class_name]