Add facebook import
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.facebook import FacebookIE
27 from .extractor.gametrailers import GametrailersIE
28 from .extractor.generic import GenericIE
29 from .extractor.googleplus import GooglePlusIE
30 from .extractor.googlesearch import GoogleSearchIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.myvideo import MyVideoIE
33 from .extractor.statigram import StatigramIE
34 from .extractor.photobucket import PhotobucketIE
35 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
36 from .extractor.vimeo import VimeoIE
37 from .extractor.yahoo import YahooIE, YahooSearchIE
38 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
39 from .extractor.zdf import ZDFIE
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59 class DepositFilesIE(InfoExtractor):
60     """Information extractor for depositfiles.com"""
61
62     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
63
64     def _real_extract(self, url):
65         file_id = url.split('/')[-1]
66         # Rebuild url in english locale
67         url = 'http://depositfiles.com/en/files/' + file_id
68
69         # Retrieve file webpage with 'Free download' button pressed
70         free_download_indication = { 'gateway_result' : '1' }
71         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
72         try:
73             self.report_download_webpage(file_id)
74             webpage = compat_urllib_request.urlopen(request).read()
75         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
76             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
77
78         # Search for the real file URL
79         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
80         if (mobj is None) or (mobj.group(1) is None):
81             # Try to figure out reason of the error.
82             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
83             if (mobj is not None) and (mobj.group(1) is not None):
84                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
85                 raise ExtractorError(u'%s' % restriction_message)
86             else:
87                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
88
89         file_url = mobj.group(1)
90         file_extension = os.path.splitext(file_url)[1][1:]
91
92         # Search for file title
93         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
94
95         return [{
96             'id':       file_id.decode('utf-8'),
97             'url':      file_url.decode('utf-8'),
98             'uploader': None,
99             'upload_date':  None,
100             'title':    file_title,
101             'ext':      file_extension.decode('utf-8'),
102         }]
103
104
105
106
107
108
109
110
111
112 class EscapistIE(InfoExtractor):
113     """Information extractor for The Escapist """
114
115     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
116     IE_NAME = u'escapist'
117
118     def _real_extract(self, url):
119         mobj = re.match(self._VALID_URL, url)
120         if mobj is None:
121             raise ExtractorError(u'Invalid URL: %s' % url)
122         showName = mobj.group('showname')
123         videoId = mobj.group('episode')
124
125         self.report_extraction(videoId)
126         webpage = self._download_webpage(url, videoId)
127
128         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
129             webpage, u'description', fatal=False)
130
131         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
132             webpage, u'thumbnail', fatal=False)
133
134         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
135             webpage, u'player url')
136
137         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
138             webpage, u'title').split(' : ')[-1]
139
140         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
141         configUrl = compat_urllib_parse.unquote(configUrl)
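        # The og:video player URL embeds the address of the player's
        # configuration file in its URL-encoded 'config' query parameter.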
142
143         configJSON = self._download_webpage(configUrl, videoId,
144                                             u'Downloading configuration',
145                                             u'unable to download configuration')
146
147         # Technically, it's JavaScript, not JSON
148         configJSON = configJSON.replace("'", '"')
149
150         try:
151             config = json.loads(configJSON)
152         except (ValueError,) as err:
153             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
154
155         playlist = config['playlist']
156         videoUrl = playlist[1]['url']
157
158         info = {
159             'id': videoId,
160             'url': videoUrl,
161             'uploader': showName,
162             'upload_date': None,
163             'title': title,
164             'ext': 'mp4',
165             'thumbnail': imgUrl,
166             'description': videoDesc,
167             'player_url': playerUrl,
168         }
169
170         return [info]
171
172 class CollegeHumorIE(InfoExtractor):
173     """Information extractor for collegehumor.com"""
174
175     _WORKING = False
176     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
177     IE_NAME = u'collegehumor'
178
179     def report_manifest(self, video_id):
180         """Report information extraction."""
181         self.to_screen(u'%s: Downloading XML manifest' % video_id)
182
183     def _real_extract(self, url):
184         mobj = re.match(self._VALID_URL, url)
185         if mobj is None:
186             raise ExtractorError(u'Invalid URL: %s' % url)
187         video_id = mobj.group('videoid')
188
189         info = {
190             'id': video_id,
191             'uploader': None,
192             'upload_date': None,
193         }
194
195         self.report_extraction(video_id)
196         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
197         try:
198             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
199         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
200             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
201
202         mdoc = xml.etree.ElementTree.fromstring(metaXml)
203         try:
204             videoNode = mdoc.findall('./video')[0]
205             info['description'] = videoNode.findall('./description')[0].text
206             info['title'] = videoNode.findall('./caption')[0].text
207             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
208             manifest_url = videoNode.findall('./file')[0].text
209         except IndexError:
210             raise ExtractorError(u'Invalid metadata XML file')
211
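        # The <file> element points at an Adobe HDS (f4m) manifest; fetch it to
        # locate the media node and build the fragment URL below.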
212         manifest_url += '?hdcore=2.10.3'
213         self.report_manifest(video_id)
214         try:
215             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
216         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
217             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
218
219         adoc = xml.etree.ElementTree.fromstring(manifestXml)
220         try:
221             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
222             node_id = media_node.attrib['url']
223             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
224         except IndexError as err:
225             raise ExtractorError(u'Invalid manifest file')
226
227         url_pr = compat_urllib_parse_urlparse(manifest_url)
228         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
229
230         info['url'] = url
231         info['ext'] = 'f4f'
232         return [info]
233
234
235 class XVideosIE(InfoExtractor):
236     """Information extractor for xvideos.com"""
237
238     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
239     IE_NAME = u'xvideos'
240
241     def _real_extract(self, url):
242         mobj = re.match(self._VALID_URL, url)
243         if mobj is None:
244             raise ExtractorError(u'Invalid URL: %s' % url)
245         video_id = mobj.group(1)
246
247         webpage = self._download_webpage(url, video_id)
248
249         self.report_extraction(video_id)
250
251         # Extract video URL
252         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
253             webpage, u'video URL'))
254
255         # Extract title
256         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
257             webpage, u'title')
258
259         # Extract video thumbnail
260         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
261             webpage, u'thumbnail', fatal=False)
262
263         info = {
264             'id': video_id,
265             'url': video_url,
266             'uploader': None,
267             'upload_date': None,
268             'title': video_title,
269             'ext': 'flv',
270             'thumbnail': video_thumbnail,
271             'description': None,
272         }
273
274         return [info]
275
276
277
278
279 class InfoQIE(InfoExtractor):
280     """Information extractor for infoq.com"""
281     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
282
283     def _real_extract(self, url):
284         mobj = re.match(self._VALID_URL, url)
285         if mobj is None:
286             raise ExtractorError(u'Invalid URL: %s' % url)
287
288         webpage = self._download_webpage(url, video_id=url)
289         self.report_extraction(url)
290
291         # Extract video URL
292         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
293         if mobj is None:
294             raise ExtractorError(u'Unable to extract video url')
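        # The jsclassref value is a base64-encoded (and URL-quoted) stream path;
        # decode and unquote it, then append it to the rtmpe:// base URL.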
295         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
296         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
297
298         # Extract title
299         video_title = self._search_regex(r'contentTitle = "(.*?)";',
300             webpage, u'title')
301
302         # Extract description
303         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
304             webpage, u'description', fatal=False)
305
306         video_filename = video_url.split('/')[-1]
307         video_id, extension = video_filename.split('.')
308
309         info = {
310             'id': video_id,
311             'url': video_url,
312             'uploader': None,
313             'upload_date': None,
314             'title': video_title,
315             'ext': extension, # Extension is always(?) mp4, but seems to be flv
316             'thumbnail': None,
317             'description': video_description,
318         }
319
320         return [info]
321
322 class MixcloudIE(InfoExtractor):
323     """Information extractor for www.mixcloud.com"""
324
325     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
326     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
327     IE_NAME = u'mixcloud'
328
329     def report_download_json(self, file_id):
330         """Report JSON download."""
331         self.to_screen(u'Downloading json')
332
333     def get_urls(self, jsonData, fmt, bitrate='best'):
334         """Get urls from 'audio_formats' section in json"""
335         file_url = None
336         try:
337             bitrate_list = jsonData[fmt]
338             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
339                 bitrate = max(bitrate_list) # select highest
340
341             url_list = jsonData[fmt][bitrate]
342         except TypeError: # we have no bitrate info.
343             url_list = jsonData[fmt]
344         return url_list
345
346     def check_urls(self, url_list):
347         """Returns 1st active url from list"""
348         for url in url_list:
349             try:
350                 compat_urllib_request.urlopen(url)
351                 return url
352             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
353                 url = None
354
355         return None
356
357     def _print_formats(self, formats):
358         print('Available formats:')
359         for fmt in formats.keys():
360             for b in formats[fmt]:
361                 try:
362                     ext = formats[fmt][b][0]
363                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
364                 except TypeError: # we have no bitrate info
365                     ext = formats[fmt][0]
366                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
367                     break
368
369     def _real_extract(self, url):
370         mobj = re.match(self._VALID_URL, url)
371         if mobj is None:
372             raise ExtractorError(u'Invalid URL: %s' % url)
373         # extract uploader & filename from url
374         uploader = mobj.group(1).decode('utf-8')
375         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
376
377         # construct API request
378         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
379         # retrieve .json file with links to files
380         request = compat_urllib_request.Request(file_url)
381         try:
382             self.report_download_json(file_url)
383             jsonData = compat_urllib_request.urlopen(request).read()
384         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
385             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
386
387         # parse JSON
388         json_data = json.loads(jsonData)
389         player_url = json_data['player_swf_url']
390         formats = dict(json_data['audio_formats'])
391
392         req_format = self._downloader.params.get('format', None)
393         bitrate = None
394
395         if self._downloader.params.get('listformats', None):
396             self._print_formats(formats)
397             return
398
399         if req_format is None or req_format == 'best':
400             for format_param in formats.keys():
401                 url_list = self.get_urls(formats, format_param)
402                 # check urls
403                 file_url = self.check_urls(url_list)
404                 if file_url is not None:
405                     break # got it!
406         else:
407             if req_format not in formats:
408                 raise ExtractorError(u'Format is not available')
409
410             url_list = self.get_urls(formats, req_format)
411             file_url = self.check_urls(url_list)
412             format_param = req_format
413
414         return [{
415             'id': file_id.decode('utf-8'),
416             'url': file_url.decode('utf-8'),
417             'uploader': uploader.decode('utf-8'),
418             'upload_date': None,
419             'title': json_data['name'],
420             'ext': file_url.split('.')[-1].decode('utf-8'),
421             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
422             'thumbnail': json_data['thumbnail_url'],
423             'description': json_data['description'],
424             'player_url': player_url.decode('utf-8'),
425         }]
426
427 class StanfordOpenClassroomIE(InfoExtractor):
428     """Information extractor for Stanford's Open ClassRoom"""
429
430     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
431     IE_NAME = u'stanfordoc'
432
433     def _real_extract(self, url):
434         mobj = re.match(self._VALID_URL, url)
435         if mobj is None:
436             raise ExtractorError(u'Invalid URL: %s' % url)
437
438         if mobj.group('course') and mobj.group('video'): # A specific video
439             course = mobj.group('course')
440             video = mobj.group('video')
441             info = {
442                 'id': course + '_' + video,
443                 'uploader': None,
444                 'upload_date': None,
445             }
446
447             self.report_extraction(info['id'])
448             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
449             xmlUrl = baseUrl + video + '.xml'
450             try:
451                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
452             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
453                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
454             mdoc = xml.etree.ElementTree.fromstring(metaXml)
455             try:
456                 info['title'] = mdoc.findall('./title')[0].text
457                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
458             except IndexError:
459                 raise ExtractorError(u'Invalid metadata XML file')
460             info['ext'] = info['url'].rpartition('.')[2]
461             return [info]
462         elif mobj.group('course'): # A course page
463             course = mobj.group('course')
464             info = {
465                 'id': course,
466                 'type': 'playlist',
467                 'uploader': None,
468                 'upload_date': None,
469             }
470
471             coursepage = self._download_webpage(url, info['id'],
472                                         note='Downloading course info page',
473                                         errnote='Unable to download course info page')
474
475             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
476
477             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
478                 coursepage, u'description', fatal=False)
479
480             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
481             info['list'] = [
482                 {
483                     'type': 'reference',
484                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
485                 }
486                     for vpage in links]
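            # Each entry only references a VideoPage URL; feed it back through
            # the extractor framework to resolve the actual video.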
487             results = []
488             for entry in info['list']:
489                 assert entry['type'] == 'reference'
490                 results += self.extract(entry['url'])
491             return results
492         else: # Root page
493             info = {
494                 'id': 'Stanford OpenClassroom',
495                 'type': 'playlist',
496                 'uploader': None,
497                 'upload_date': None,
498             }
499
500             self.report_download_webpage(info['id'])
501             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
502             try:
503                 rootpage = compat_urllib_request.urlopen(rootURL).read()
504             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
505                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
506
507             info['title'] = info['id']
508
509             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
510             info['list'] = [
511                 {
512                     'type': 'reference',
513                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
514                 }
515                     for cpage in links]
516
517             results = []
518             for entry in info['list']:
519                 assert entry['type'] == 'reference'
520                 results += self.extract(entry['url'])
521             return results
522
523 class MTVIE(InfoExtractor):
524     """Information extractor for MTV.com"""
525
526     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
527     IE_NAME = u'mtv'
528
529     def _real_extract(self, url):
530         mobj = re.match(self._VALID_URL, url)
531         if mobj is None:
532             raise ExtractorError(u'Invalid URL: %s' % url)
533         if not mobj.group('proto'):
534             url = 'http://' + url
535         video_id = mobj.group('videoid')
536
537         webpage = self._download_webpage(url, video_id)
538
539         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
540             webpage, u'song name', fatal=False)
541
542         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
543             webpage, u'performer')
544         video_title = (performer + ' - ' + song_name) if song_name else performer
545         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
546             webpage, u'mtvn_uri', fatal=False)
547
548         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
549             webpage, u'content id', fatal=False)
550
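        # The actual stream renditions come from MTV's mediaGen service, which
        # returns an XML document with one <rendition> element per quality.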
551         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
552         self.report_extraction(video_id)
553         request = compat_urllib_request.Request(videogen_url)
554         try:
555             metadataXml = compat_urllib_request.urlopen(request).read()
556         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
557             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
558
559         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
560         renditions = mdoc.findall('.//rendition')
561
562         # For now, always pick the highest quality.
563         rendition = renditions[-1]
564
565         try:
566             _,_,ext = rendition.attrib['type'].partition('/')
567             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
568             video_url = rendition.find('./src').text
569         except KeyError:
570             raise ExtractorError('Invalid rendition field.')
571
572         info = {
573             'id': video_id,
574             'url': video_url,
575             'uploader': performer,
576             'upload_date': None,
577             'title': video_title,
578             'ext': ext,
579             'format': format,
580         }
581
582         return [info]
583
584
585 class YoukuIE(InfoExtractor):
586     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
587
588     def _gen_sid(self):
589         nowTime = int(time.time() * 1000)
590         random1 = random.randint(1000,1998)
591         random2 = random.randint(1000,9999)
592
593         return "%d%d%d" %(nowTime,random1,random2)
594
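    # Youku returns the real fileid obfuscated as a '*'-separated list of
    # indices plus a numeric seed. The seed drives a small linear congruential
    # shuffle of a fixed character table; looking the indices up in the
    # shuffled table (the two helpers below) reconstructs the real fileid.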
595     def _get_file_ID_mix_string(self, seed):
596         mixed = []
597         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
598         seed = float(seed)
599         for i in range(len(source)):
600             seed  =  (seed * 211 + 30031 ) % 65536
601             index  =  math.floor(seed / 65536 * len(source) )
602             mixed.append(source[int(index)])
603             source.remove(source[int(index)])
604         #return ''.join(mixed)
605         return mixed
606
607     def _get_file_id(self, fileId, seed):
608         mixed = self._get_file_ID_mix_string(seed)
609         ids = fileId.split('*')
610         realId = []
611         for ch in ids:
612             if ch:
613                 realId.append(mixed[int(ch)])
614         return ''.join(realId)
615
616     def _real_extract(self, url):
617         mobj = re.match(self._VALID_URL, url)
618         if mobj is None:
619             raise ExtractorError(u'Invalid URL: %s' % url)
620         video_id = mobj.group('ID')
621
622         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
623
624         jsondata = self._download_webpage(info_url, video_id)
625
626         self.report_extraction(video_id)
627         try:
628             config = json.loads(jsondata)
629
630             video_title =  config['data'][0]['title']
631             seed = config['data'][0]['seed']
632
633             format = self._downloader.params.get('format', None)
634             supported_format = list(config['data'][0]['streamfileids'].keys())
635
636             if format is None or format == 'best':
637                 if 'hd2' in supported_format:
638                     format = 'hd2'
639                 else:
640                     format = 'flv'
641                 ext = u'flv'
642             elif format == 'worst':
643                 format = 'mp4'
644                 ext = u'mp4'
645             else:
646                 format = 'flv'
647                 ext = u'flv'
648
649
650             fileid = config['data'][0]['streamfileids'][format]
651             keys = [s['k'] for s in config['data'][0]['segs'][format]]
652         except (UnicodeDecodeError, ValueError, KeyError):
653             raise ExtractorError(u'Unable to extract info section')
654
655         files_info=[]
656         sid = self._gen_sid()
657         fileid = self._get_file_id(fileid, seed)
658
659         # Characters 8 and 9 (0-based) of the fileid encode the segment number,
660         # so fileid[8:10] is replaced with the segment index for each part
661         for index, key in enumerate(keys):
662
663             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
664             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
665
666             info = {
667                 'id': '%s_part%02d' % (video_id, index),
668                 'url': download_url,
669                 'uploader': None,
670                 'upload_date': None,
671                 'title': video_title,
672                 'ext': ext,
673             }
674             files_info.append(info)
675
676         return files_info
677
678
679 class XNXXIE(InfoExtractor):
680     """Information extractor for xnxx.com"""
681
682     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
683     IE_NAME = u'xnxx'
684     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
685     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
686     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
687
688     def _real_extract(self, url):
689         mobj = re.match(self._VALID_URL, url)
690         if mobj is None:
691             raise ExtractorError(u'Invalid URL: %s' % url)
692         video_id = mobj.group(1)
693
694         # Get webpage content
695         webpage = self._download_webpage(url, video_id)
696
697         video_url = self._search_regex(self.VIDEO_URL_RE,
698             webpage, u'video URL')
699         video_url = compat_urllib_parse.unquote(video_url)
700
701         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
702             webpage, u'title')
703
704         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
705             webpage, u'thumbnail', fatal=False)
706
707         return [{
708             'id': video_id,
709             'url': video_url,
710             'uploader': None,
711             'upload_date': None,
712             'title': video_title,
713             'ext': 'flv',
714             'thumbnail': video_thumbnail,
715             'description': None,
716         }]
717
718
719
720 class NBAIE(InfoExtractor):
721     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
722     IE_NAME = u'nba'
723
724     def _real_extract(self, url):
725         mobj = re.match(self._VALID_URL, url)
726         if mobj is None:
727             raise ExtractorError(u'Invalid URL: %s' % url)
728
729         video_id = mobj.group(1)
730
731         webpage = self._download_webpage(url, video_id)
732
733         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
734
735         shortened_video_id = video_id.rpartition('/')[2]
736         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
737             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
738
739         # It isn't there in the HTML it returns to us
740         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
741
742         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
743
744         info = {
745             'id': shortened_video_id,
746             'url': video_url,
747             'ext': 'mp4',
748             'title': title,
749             # 'uploader_date': uploader_date,
750             'description': description,
751         }
752         return [info]
753
754 class JustinTVIE(InfoExtractor):
755     """Information extractor for justin.tv and twitch.tv"""
756     # TODO: One broadcast may be split into multiple videos. The key
757     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
758     # starts at 1 and increases. Can we treat all parts as one video?
759
760     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
761         (?:
762             (?P<channelid>[^/]+)|
763             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
764             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
765         )
766         /?(?:\#.*)?$
767         """
768     _JUSTIN_PAGE_LIMIT = 100
769     IE_NAME = u'justin.tv'
770
771     def report_download_page(self, channel, offset):
772         """Report attempt to download a single page of videos."""
773         self.to_screen(u'%s: Downloading video information from %d to %d' %
774                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
775
776     # Return count of items, list of *valid* items
777     def _parse_page(self, url, video_id):
778         webpage = self._download_webpage(url, video_id,
779                                          u'Downloading video info JSON',
780                                          u'unable to download video info JSON')
781
782         response = json.loads(webpage)
783         if type(response) != list:
784             error_text = response.get('error', 'unknown error')
785             raise ExtractorError(u'Justin.tv API: %s' % error_text)
786         info = []
787         for clip in response:
788             video_url = clip['video_file_url']
789             if video_url:
790                 video_extension = os.path.splitext(video_url)[1][1:]
791                 video_date = re.sub('-', '', clip['start_time'][:10])
792                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
793                 video_id = clip['id']
794                 video_title = clip.get('title', video_id)
795                 info.append({
796                     'id': video_id,
797                     'url': video_url,
798                     'title': video_title,
799                     'uploader': clip.get('channel_name', video_uploader_id),
800                     'uploader_id': video_uploader_id,
801                     'upload_date': video_date,
802                     'ext': video_extension,
803                 })
804         return (len(response), info)
805
806     def _real_extract(self, url):
807         mobj = re.match(self._VALID_URL, url)
808         if mobj is None:
809             raise ExtractorError(u'invalid URL: %s' % url)
810
811         api_base = 'http://api.justin.tv'
812         paged = False
813         if mobj.group('channelid'):
814             paged = True
815             video_id = mobj.group('channelid')
816             api = api_base + '/channel/archives/%s.json' % video_id
817         elif mobj.group('chapterid'):
818             chapter_id = mobj.group('chapterid')
819
820             webpage = self._download_webpage(url, chapter_id)
821             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
822             if not m:
823                 raise ExtractorError(u'Cannot find archive of a chapter')
824             archive_id = m.group(1)
825
826             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
827             chapter_info_xml = self._download_webpage(api, chapter_id,
828                                              note=u'Downloading chapter information',
829                                              errnote=u'Chapter information download failed')
830             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
831             for a in doc.findall('.//archive'):
832                 if archive_id == a.find('./id').text:
833                     break
834             else:
835                 raise ExtractorError(u'Could not find chapter in chapter information')
836
837             video_url = a.find('./video_file_url').text
838             video_ext = video_url.rpartition('.')[2] or u'flv'
839
840             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
841             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
842                                    note='Downloading chapter metadata',
843                                    errnote='Download of chapter metadata failed')
844             chapter_info = json.loads(chapter_info_json)
845
846             bracket_start = int(doc.find('.//bracket_start').text)
847             bracket_end = int(doc.find('.//bracket_end').text)
848
849             # TODO determine start (and probably fix up file)
850             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
851             #video_url += u'?start=' + TODO:start_timestamp
852             # bracket_start is 13290, but we want 51670615
853             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
854                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
855
856             info = {
857                 'id': u'c' + chapter_id,
858                 'url': video_url,
859                 'ext': video_ext,
860                 'title': chapter_info['title'],
861                 'thumbnail': chapter_info['preview'],
862                 'description': chapter_info['description'],
863                 'uploader': chapter_info['channel']['display_name'],
864                 'uploader_id': chapter_info['channel']['name'],
865             }
866             return [info]
867         else:
868             video_id = mobj.group('videoid')
869             api = api_base + '/broadcast/by_archive/%s.json' % video_id
870
871         self.report_extraction(video_id)
872
873         info = []
874         offset = 0
875         limit = self._JUSTIN_PAGE_LIMIT
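        # Channel archives are paged: keep fetching batches of 'limit' clips,
        # stopping as soon as the API returns fewer than a full page.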
876         while True:
877             if paged:
878                 self.report_download_page(video_id, offset)
879             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
880             page_count, page_info = self._parse_page(page_url, video_id)
881             info.extend(page_info)
882             if not paged or page_count != limit:
883                 break
884             offset += limit
885         return info
886
887 class FunnyOrDieIE(InfoExtractor):
888     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
889
890     def _real_extract(self, url):
891         mobj = re.match(self._VALID_URL, url)
892         if mobj is None:
893             raise ExtractorError(u'invalid URL: %s' % url)
894
895         video_id = mobj.group('id')
896         webpage = self._download_webpage(url, video_id)
897
898         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
899             webpage, u'video URL', flags=re.DOTALL)
900
901         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
902             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
903
904         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
905             webpage, u'description', fatal=False, flags=re.DOTALL)
906
907         info = {
908             'id': video_id,
909             'url': video_url,
910             'ext': 'mp4',
911             'title': title,
912             'description': video_description,
913         }
914         return [info]
915
916 class SteamIE(InfoExtractor):
917     _VALID_URL = r"""http://store\.steampowered\.com/
918                 (agecheck/)?
919                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
920                 (?P<gameID>\d+)/?
921                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
922                 """
923     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
924     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
925
926     @classmethod
927     def suitable(cls, url):
928         """Receives a URL and returns True if suitable for this IE."""
929         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
930
931     def _real_extract(self, url):
932         m = re.match(self._VALID_URL, url, re.VERBOSE)
933         gameID = m.group('gameID')
934
935         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
936         webpage = self._download_webpage(videourl, gameID)
937
938         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
939             videourl = self._AGECHECK_TEMPLATE % gameID
940             self.report_age_confirmation()
941             webpage = self._download_webpage(videourl, gameID)
942
943         self.report_extraction(gameID)
944         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
945                                              webpage, 'game title')
946
947         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
948         mweb = re.finditer(urlRE, webpage)
949         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
950         titles = re.finditer(namesRE, webpage)
951         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
952         thumbs = re.finditer(thumbsRE, webpage)
953         videos = []
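        # Movie URLs, display names and thumbnails are scattered across the
        # page; zip the three iterators to build one entry per trailer.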
954         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
955             video_id = vid.group('videoID')
956             title = vtitle.group('videoName')
957             video_url = vid.group('videoURL')
958             video_thumb = thumb.group('thumbnail')
959             if not video_url:
960                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
961             info = {
962                 'id':video_id,
963                 'url':video_url,
964                 'ext': 'flv',
965                 'title': unescapeHTML(title),
966                 'thumbnail': video_thumb
967                   }
968             videos.append(info)
969         return [self.playlist_result(videos, gameID, game_title)]
970
971 class UstreamIE(InfoExtractor):
972     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
973     IE_NAME = u'ustream'
974
975     def _real_extract(self, url):
976         m = re.match(self._VALID_URL, url)
977         video_id = m.group('videoID')
978
979         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
980         webpage = self._download_webpage(url, video_id)
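        # Recorded videos are served directly from Ustream's CDN using the
        # numeric id; the page is only scraped for title, uploader and
        # thumbnail.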
981
982         self.report_extraction(video_id)
983
984         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
985             webpage, u'title')
986
987         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
988             webpage, u'uploader', fatal=False, flags=re.DOTALL)
989
990         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
991             webpage, u'thumbnail', fatal=False)
992
993         info = {
994                 'id': video_id,
995                 'url': video_url,
996                 'ext': 'flv',
997                 'title': video_title,
998                 'uploader': uploader,
999                 'thumbnail': thumbnail,
1000                }
1001         return [info]
1002
1003 class WorldStarHipHopIE(InfoExtractor):
1004     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1005     IE_NAME = u'WorldStarHipHop'
1006
1007     def _real_extract(self, url):
1008         m = re.match(self._VALID_URL, url)
1009         video_id = m.group('id')
1010
1011         webpage_src = self._download_webpage(url, video_id)
1012
1013         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1014             webpage_src, u'video URL')
1015
1016         if 'mp4' in video_url:
1017             ext = 'mp4'
1018         else:
1019             ext = 'flv'
1020
1021         video_title = self._html_search_regex(r"<title>(.*)</title>",
1022             webpage_src, u'title')
1023
1024         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1025         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1026             webpage_src, u'thumbnail', fatal=False)
1027
1028         if not thumbnail:
1029             _title = r"""candytitles.*>(.*)</span>"""
1030             mobj = re.search(_title, webpage_src)
1031             if mobj is not None:
1032                 video_title = mobj.group(1)
1033
1034         results = [{
1035                     'id': video_id,
1036                     'url' : video_url,
1037                     'title' : video_title,
1038                     'thumbnail' : thumbnail,
1039                     'ext' : ext,
1040                     }]
1041         return results
1042
1043 class RBMARadioIE(InfoExtractor):
1044     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1045
1046     def _real_extract(self, url):
1047         m = re.match(self._VALID_URL, url)
1048         video_id = m.group('videoID')
1049
1050         webpage = self._download_webpage(url, video_id)
1051
1052         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1053             webpage, u'json data', flags=re.MULTILINE)
1054
1055         try:
1056             data = json.loads(json_data)
1057         except ValueError as e:
1058             raise ExtractorError(u'Invalid JSON: ' + str(e))
1059
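        # The decoded gon.show object carries an Akamai stream URL; cbr=256 is
        # appended, presumably selecting the 256 kbps encode.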
1060         video_url = data['akamai_url'] + '&cbr=256'
1061         url_parts = compat_urllib_parse_urlparse(video_url)
1062         video_ext = url_parts.path.rpartition('.')[2]
1063         info = {
1064                 'id': video_id,
1065                 'url': video_url,
1066                 'ext': video_ext,
1067                 'title': data['title'],
1068                 'description': data.get('teaser_text'),
1069                 'location': data.get('country_of_origin'),
1070                 'uploader': data.get('host', {}).get('name'),
1071                 'uploader_id': data.get('host', {}).get('slug'),
1072                 'thumbnail': data.get('image', {}).get('large_url_2x'),
1073                 'duration': data.get('duration'),
1074         }
1075         return [info]
1076
1077
1078 class YouPornIE(InfoExtractor):
1079     """Information extractor for youporn.com."""
1080     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1081
1082     def _print_formats(self, formats):
1083         """Print all available formats"""
1084         print(u'Available formats:')
1085         print(u'ext\t\tformat')
1086         print(u'---------------------------------')
1087         for format in formats:
1088             print(u'%s\t\t%s'  % (format['ext'], format['format']))
1089
1090     def _specific(self, req_format, formats):
1091         for x in formats:
1092             if(x["format"]==req_format):
1093                 return x
1094         return None
1095
1096     def _real_extract(self, url):
1097         mobj = re.match(self._VALID_URL, url)
1098         if mobj is None:
1099             raise ExtractorError(u'Invalid URL: %s' % url)
1100         video_id = mobj.group('videoid')
1101
1102         req = compat_urllib_request.Request(url)
1103         req.add_header('Cookie', 'age_verified=1')
1104         webpage = self._download_webpage(req, video_id)
1105
1106         # Get JSON parameters
1107         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1108         try:
1109             params = json.loads(json_params)
1110         except ValueError:
1111             raise ExtractorError(u'Invalid JSON')
1112
1113         self.report_extraction(video_id)
1114         try:
1115             video_title = params['title']
1116             upload_date = unified_strdate(params['release_date_f'])
1117             video_description = params['description']
1118             video_uploader = params['submitted_by']
1119             thumbnail = params['thumbnails'][0]['image']
1120         except KeyError as err:
1121             raise ExtractorError(u'Missing JSON parameter: %s' % err)
1122
1123         # Get all of the formats available
1124         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1125         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1126             webpage, u'download list').strip()
1127
1128         # Get all of the links from the page
1129         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1130         links = re.findall(LINK_RE, download_list_html)
1131         if(len(links) == 0):
1132             raise ExtractorError(u'ERROR: no known formats available for video')
1133
1134         self.to_screen(u'Links found: %d' % len(links))
1135
1136         formats = []
1137         for link in links:
1138
1139             # A link looks like this:
1140             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1141             # A path looks like this:
1142             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
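            # The fifth path component ('480p_370k_8004515') carries the
            # resolution and bitrate, which are reused as the format label.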
1143             video_url = unescapeHTML( link )
1144             path = compat_urllib_parse_urlparse( video_url ).path
1145             extension = os.path.splitext( path )[1][1:]
1146             format = path.split('/')[4].split('_')[:2]
1147             size = format[0]
1148             bitrate = format[1]
1149             format = "-".join( format )
1150             # title = u'%s-%s-%s' % (video_title, size, bitrate)
1151
1152             formats.append({
1153                 'id': video_id,
1154                 'url': video_url,
1155                 'uploader': video_uploader,
1156                 'upload_date': upload_date,
1157                 'title': video_title,
1158                 'ext': extension,
1159                 'format': format,
1160                 'thumbnail': thumbnail,
1161                 'description': video_description
1162             })
1163
1164         if self._downloader.params.get('listformats', None):
1165             self._print_formats(formats)
1166             return
1167
1168         req_format = self._downloader.params.get('format', None)
1169         self.to_screen(u'Format: %s' % req_format)
1170
1171         if req_format is None or req_format == 'best':
1172             return [formats[0]]
1173         elif req_format == 'worst':
1174             return [formats[-1]]
1175         elif req_format in ('-1', 'all'):
1176             return formats
1177         else:
1178             format = self._specific( req_format, formats )
1179             if format is None:
1180                 raise ExtractorError(u'Requested format not available')
1181             return [format]
1182
1183
1184
1185 class PornotubeIE(InfoExtractor):
1186     """Information extractor for pornotube.com."""
1187     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1188
1189     def _real_extract(self, url):
1190         mobj = re.match(self._VALID_URL, url)
1191         if mobj is None:
1192             raise ExtractorError(u'Invalid URL: %s' % url)
1193
1194         video_id = mobj.group('videoid')
1195         video_title = mobj.group('title')
1196
1197         # Get webpage content
1198         webpage = self._download_webpage(url, video_id)
1199
1200         # Get the video URL
1201         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1202         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1203         video_url = compat_urllib_parse.unquote(video_url)
1204
1205         #Get the uploaded date
1206         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1207         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1208         if upload_date: upload_date = unified_strdate(upload_date)
1209
1210         info = {'id': video_id,
1211                 'url': video_url,
1212                 'uploader': None,
1213                 'upload_date': upload_date,
1214                 'title': video_title,
1215                 'ext': 'flv',
1216                 'format': 'flv'}
1217
1218         return [info]
1219
1220 class YouJizzIE(InfoExtractor):
1221     """Information extractor for youjizz.com."""
1222     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1223
1224     def _real_extract(self, url):
1225         mobj = re.match(self._VALID_URL, url)
1226         if mobj is None:
1227             raise ExtractorError(u'Invalid URL: %s' % url)
1228
1229         video_id = mobj.group('videoid')
1230
1231         # Get webpage content
1232         webpage = self._download_webpage(url, video_id)
1233
1234         # Get the video title
1235         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1236             webpage, u'title').strip()
1237
1238         # Get the embed page
1239         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1240         if result is None:
1241             raise ExtractorError(u'ERROR: unable to extract embed page')
1242
1243         embed_page_url = result.group(0).strip()
1244         video_id = result.group('videoid')
1245
1246         webpage = self._download_webpage(embed_page_url, video_id)
1247
1248         # Get the video URL
1249         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1250             webpage, u'video URL')
1251
1252         info = {'id': video_id,
1253                 'url': video_url,
1254                 'title': video_title,
1255                 'ext': 'flv',
1256                 'format': 'flv',
1257                 'player_url': embed_page_url}
1258
1259         return [info]
1260
1261 class EightTracksIE(InfoExtractor):
1262     IE_NAME = '8tracks'
1263     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1264
1265     def _real_extract(self, url):
1266         mobj = re.match(self._VALID_URL, url)
1267         if mobj is None:
1268             raise ExtractorError(u'Invalid URL: %s' % url)
1269         playlist_id = mobj.group('id')
1270
1271         webpage = self._download_webpage(url, playlist_id)
1272
1273         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1274         data = json.loads(json_like)
1275
1276         session = str(random.randint(0, 1000000000))
1277         mix_id = data['id']
1278         track_count = data['tracks_count']
1279         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1280         next_url = first_url
1281         res = []
1282         for i in itertools.count():
1283             api_json = self._download_webpage(next_url, playlist_id,
1284                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1285                 errnote=u'Failed to download song information')
1286             api_data = json.loads(api_json)
1287             track_data = api_data[u'set']['track']
1288             info = {
1289                 'id': track_data['id'],
1290                 'url': track_data['track_file_stream_url'],
1291                 'title': track_data['performer'] + u' - ' + track_data['name'],
1292                 'raw_title': track_data['name'],
1293                 'uploader_id': data['user']['login'],
1294                 'ext': 'm4a',
1295             }
1296             res.append(info)
1297             if api_data['set']['at_last_track']:
1298                 break
1299             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
1300         return res
1301
1302 class KeekIE(InfoExtractor):
1303     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1304     IE_NAME = u'keek'
1305
1306     def _real_extract(self, url):
1307         m = re.match(self._VALID_URL, url)
1308         video_id = m.group('videoID')
1309
1310         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1311         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
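        # Both the clip and a 100x75 thumbnail live at predictable CDN paths
        # derived from the video id; the page only supplies title and uploader.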
1312         webpage = self._download_webpage(url, video_id)
1313
1314         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1315             webpage, u'title')
1316
1317         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1318             webpage, u'uploader', fatal=False)
1319
1320         info = {
1321                 'id': video_id,
1322                 'url': video_url,
1323                 'ext': 'mp4',
1324                 'title': video_title,
1325                 'thumbnail': thumbnail,
1326                 'uploader': uploader
1327         }
1328         return [info]
1329
1330 class TEDIE(InfoExtractor):
1331     _VALID_URL=r'''http://www\.ted\.com/
1332                    (
1333                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1334                         |
1335                         ((?P<type_talk>talks)) # We have a simple talk
1336                    )
1337                    (/lang/(.*?))? # The url may contain the language
1338                    /(?P<name>\w+) # Here goes the name and then ".html"
1339                    '''
1340
1341     @classmethod
1342     def suitable(cls, url):
1343         """Receives a URL and returns True if suitable for this IE."""
1344         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1345
1346     def _real_extract(self, url):
1347         m = re.match(self._VALID_URL, url, re.VERBOSE)
1348         if m.group('type_talk'):
1349             return [self._talk_info(url)]
1350         else:
1351             playlist_id = m.group('playlist_id')
1352             name = m.group('name')
1353             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
1354             return [self._playlist_videos_info(url, name, playlist_id)]
1355
1356     def _playlist_videos_info(self, url, name, playlist_id=0):
1357         '''Returns the videos of the playlist'''
1358         video_RE=r'''
1359                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1360                      ([.\s]*?)data-playlist_item_id="(\d+)"
1361                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1362                      '''
1363         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1364         webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1365         m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
1366         m_names = re.finditer(video_name_RE, webpage)
1367
1368         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1369                                                  webpage, 'playlist title')
1370
1371         playlist_entries = []
1372         for m_video, m_name in zip(m_videos, m_names):
1373             video_id = m_video.group('video_id')
1374             talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
1375             playlist_entries.append(self.url_result(talk_url, 'TED'))
1376         return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
1377
1378     def _talk_info(self, url, video_id=0):
1379         """Return the video for the talk in the url"""
1380         m = re.match(self._VALID_URL, url, re.VERBOSE)
1381         video_name = m.group('name')
1382         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1383         self.report_extraction(video_name)
1384         # If the url includes the language we get the title translated
1385         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1386                                         webpage, 'title')
1387         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1388                                     webpage, 'json data')
1389         info = json.loads(json_data)
1390         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1391                                        webpage, 'description', flags = re.DOTALL)
1392
1393         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1394                                        webpage, 'thumbnail')
1395         info = {
1396                 'id': info['id'],
1397                 'url': info['htmlStreams'][-1]['file'],
1398                 'ext': 'mp4',
1399                 'title': title,
1400                 'thumbnail': thumbnail,
1401                 'description': desc,
1402                 }
1403         return info
1404
1405 class MySpassIE(InfoExtractor):
1406     _VALID_URL = r'http://www\.myspass\.de/.*'
1407
1408     def _real_extract(self, url):
1409         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1410
1411         # video id is the last path element of the URL
1412         # usually there is a trailing slash, so also try the second-to-last path element
1413         url_path = compat_urllib_parse_urlparse(url).path
1414         url_parent_path, video_id = os.path.split(url_path)
1415         if not video_id:
1416             _, video_id = os.path.split(url_parent_path)
1417
1418         # get metadata
1419         metadata_url = META_DATA_URL_TEMPLATE % video_id
1420         metadata_text = self._download_webpage(metadata_url, video_id)
1421         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1422
1423         # extract values from metadata
1424         url_flv_el = metadata.find('url_flv')
1425         if url_flv_el is None:
1426             raise ExtractorError(u'Unable to extract download url')
1427         video_url = url_flv_el.text
1428         extension = os.path.splitext(video_url)[1][1:]
1429         title_el = metadata.find('title')
1430         if title_el is None:
1431             raise ExtractorError(u'Unable to extract title')
1432         title = title_el.text
1433         format_id_el = metadata.find('format_id')
1434         if format_id_el is None:
1435             format = extension
1436         else:
1437             format = format_id_el.text
1438         description_el = metadata.find('description')
1439         if description_el is not None:
1440             description = description_el.text
1441         else:
1442             description = None
1443         imagePreview_el = metadata.find('imagePreview')
1444         if imagePreview_el is not None:
1445             thumbnail = imagePreview_el.text
1446         else:
1447             thumbnail = None
1448         info = {
1449             'id': video_id,
1450             'url': video_url,
1451             'title': title,
1452             'ext': extension,
1453             'format': format,
1454             'thumbnail': thumbnail,
1455             'description': description
1456         }
1457         return [info]
1458
1459 class SpiegelIE(InfoExtractor):
1460     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1461
1462     def _real_extract(self, url):
1463         m = re.match(self._VALID_URL, url)
1464         video_id = m.group('videoID')
1465
1466         webpage = self._download_webpage(url, video_id)
1467
1468         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1469             webpage, u'title')
1470
1471         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1472         xml_code = self._download_webpage(xml_url, video_id,
1473                     note=u'Downloading XML', errnote=u'Failed to download XML')
1474
1475         idoc = xml.etree.ElementTree.fromstring(xml_code)
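        # the XML lists the available renditions; the last entry is the one used below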
1476         last_type = idoc[-1]
1477         filename = last_type.findall('./filename')[0].text
1478         duration = float(last_type.findall('./duration')[0].text)
1479
1480         video_url = 'http://video2.spiegel.de/flash/' + filename
1481         video_ext = filename.rpartition('.')[2]
1482         info = {
1483             'id': video_id,
1484             'url': video_url,
1485             'ext': video_ext,
1486             'title': video_title,
1487             'duration': duration,
1488         }
1489         return [info]
1490
1491 class LiveLeakIE(InfoExtractor):
1492
1493     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1494     IE_NAME = u'liveleak'
1495
1496     def _real_extract(self, url):
1497         mobj = re.match(self._VALID_URL, url)
1498         if mobj is None:
1499             raise ExtractorError(u'Invalid URL: %s' % url)
1500
1501         video_id = mobj.group('video_id')
1502
1503         webpage = self._download_webpage(url, video_id)
1504
1505         video_url = self._search_regex(r'file: "(.*?)",',
1506             webpage, u'video URL')
1507
1508         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1509             webpage, u'title').replace('LiveLeak.com -', '').strip()
1510
1511         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1512             webpage, u'description', fatal=False)
1513
1514         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1515             webpage, u'uploader', fatal=False)
1516
1517         info = {
1518             'id':  video_id,
1519             'url': video_url,
1520             'ext': 'mp4',
1521             'title': video_title,
1522             'description': video_description,
1523             'uploader': video_uploader
1524         }
1525
1526         return [info]
1527
1528
1529
1530 class TumblrIE(InfoExtractor):
1531     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1532
1533     def _real_extract(self, url):
1534         m_url = re.match(self._VALID_URL, url)
1535         video_id = m_url.group('id')
1536         blog = m_url.group('blog_name')
1537
1538         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1539         webpage = self._download_webpage(url, video_id)
1540
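        # the video markup is embedded in escaped JavaScript, so quotes show up as \x22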
1541         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1542         video = re.search(re_video, webpage)
1543         if video is None:
1544             raise ExtractorError(u'Unable to extract video')
1545         video_url = video.group('video_url')
1546         ext = video.group('ext')
1547
1548         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1549             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1550         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1551
1552         # The only place where you can get a title, it's not complete,
1553         # but searching in other places doesn't work for all videos
1554         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1555             webpage, u'title', flags=re.DOTALL)
1556
1557         return [{'id': video_id,
1558                  'url': video_url,
1559                  'title': video_title,
1560                  'thumbnail': video_thumbnail,
1561                  'ext': ext
1562                  }]
1563
1564 class BandcampIE(InfoExtractor):
1565     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1566
1567     def _real_extract(self, url):
1568         mobj = re.match(self._VALID_URL, url)
1569         title = mobj.group('title')
1570         webpage = self._download_webpage(url, title)
1571         # We get the link to the free download page
1572         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1573         if m_download is None:
1574             raise ExtractorError(u'No free songs found')
1575
1576         download_link = m_download.group(1)
1577         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1578                        webpage, re.MULTILINE|re.DOTALL).group('id')
1579
1580         download_webpage = self._download_webpage(download_link, id,
1581                                                   'Downloading free downloads page')
1582         # We get the dictionary of the track from some javascript code
1583         info = re.search(r'items: (.*?),$',
1584                          download_webpage, re.MULTILINE).group(1)
1585         info = json.loads(info)[0]
1586         # We pick mp3-320 for now, until format selection can be easily implemented.
1587         mp3_info = info[u'downloads'][u'mp3-320']
1588         # If we try to use this url it says the link has expired
1589         initial_url = mp3_info[u'url']
1590         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1591         m_url = re.match(re_url, initial_url)
1592         # We build the url we will use to get the final track url
1593         # This url is built by Bandcamp in the script download_bunde_*.js
1594         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1595         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1596         # If we could correctly generate the .rand field the url would be
1597         # in the "download_url" key
1598         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1599
1600         track_info = {'id': id,
1601                       'title': info[u'title'],
1602                       'ext': 'mp3',
1603                       'url': final_url,
1604                       'thumbnail': info[u'thumb_url'],
1605                       'uploader': info[u'artist']
1606                       }
1607
1608         return [track_info]
1609
1610 class RedTubeIE(InfoExtractor):
1611     """Information Extractor for redtube"""
1612     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1613
1614     def _real_extract(self, url):
1615         mobj = re.match(self._VALID_URL, url)
1616         if mobj is None:
1617             raise ExtractorError(u'Invalid URL: %s' % url)
1618
1619         video_id = mobj.group('id')
1620         video_extension = 'mp4'
1621         webpage = self._download_webpage(url, video_id)
1622
1623         self.report_extraction(video_id)
1624
1625         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1626             webpage, u'video URL')
1627
1628         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1629             webpage, u'title')
1630
1631         return [{
1632             'id':       video_id,
1633             'url':      video_url,
1634             'ext':      video_extension,
1635             'title':    video_title,
1636         }]
1637
1638 class InaIE(InfoExtractor):
1639     """Information Extractor for Ina.fr"""
1640     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1641
1642     def _real_extract(self, url):
1643         mobj = re.match(self._VALID_URL, url)
1644
1645         video_id = mobj.group('id')
1646         mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1647         video_extension = 'mp4'
1648         webpage = self._download_webpage(mrss_url, video_id)
1649
1650         self.report_extraction(video_id)
1651
1652         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
1653             webpage, u'video URL')
1654
1655         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1656             webpage, u'title')
1657
1658         return [{
1659             'id':       video_id,
1660             'url':      video_url,
1661             'ext':      video_extension,
1662             'title':    video_title,
1663         }]
1664
1665 class HowcastIE(InfoExtractor):
1666     """Information Extractor for Howcast.com"""
1667     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1668
1669     def _real_extract(self, url):
1670         mobj = re.match(self._VALID_URL, url)
1671
1672         video_id = mobj.group('id')
1673         webpage_url = 'http://www.howcast.com/videos/' + video_id
1674         webpage = self._download_webpage(webpage_url, video_id)
1675
1676         self.report_extraction(video_id)
1677
1678         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1679             webpage, u'video URL')
1680
1681         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1682             webpage, u'title')
1683
1684         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1685             webpage, u'description', fatal=False)
1686
1687         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1688             webpage, u'thumbnail', fatal=False)
1689
1690         return [{
1691             'id':       video_id,
1692             'url':      video_url,
1693             'ext':      'mp4',
1694             'title':    video_title,
1695             'description': video_description,
1696             'thumbnail': thumbnail,
1697         }]
1698
1699 class VineIE(InfoExtractor):
1700     """Information Extractor for Vine.co"""
1701     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1702
1703     def _real_extract(self, url):
1704         mobj = re.match(self._VALID_URL, url)
1705
1706         video_id = mobj.group('id')
1707         webpage_url = 'https://vine.co/v/' + video_id
1708         webpage = self._download_webpage(webpage_url, video_id)
1709
1710         self.report_extraction(video_id)
1711
1712         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1713             webpage, u'video URL')
1714
1715         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1716             webpage, u'title')
1717
1718         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1719             webpage, u'thumbnail', fatal=False)
1720
1721         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1722             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1723
1724         return [{
1725             'id':        video_id,
1726             'url':       video_url,
1727             'ext':       'mp4',
1728             'title':     video_title,
1729             'thumbnail': thumbnail,
1730             'uploader':  uploader,
1731         }]
1732
1733 class FlickrIE(InfoExtractor):
1734     """Information Extractor for Flickr videos"""
1735     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1736
1737     def _real_extract(self, url):
1738         mobj = re.match(self._VALID_URL, url)
1739
1740         video_id = mobj.group('id')
1741         video_uploader_id = mobj.group('uploader_id')
1742         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1743         webpage = self._download_webpage(webpage_url, video_id)
1744
1745         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1746
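        # two XML documents are fetched: the first yields a node id, the second the
        # playlist that carries the actual stream location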
1747         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1748         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1749
1750         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1751             first_xml, u'node_id')
1752
1753         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1754         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1755
1756         self.report_extraction(video_id)
1757
1758         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1759         if mobj is None:
1760             raise ExtractorError(u'Unable to extract video url')
1761         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1762
1763         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1764             webpage, u'video title')
1765
1766         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1767             webpage, u'description', fatal=False)
1768
1769         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1770             webpage, u'thumbnail', fatal=False)
1771
1772         return [{
1773             'id':          video_id,
1774             'url':         video_url,
1775             'ext':         'mp4',
1776             'title':       video_title,
1777             'description': video_description,
1778             'thumbnail':   thumbnail,
1779             'uploader_id': video_uploader_id,
1780         }]
1781
1782 class TeamcocoIE(InfoExtractor):
1783     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1784
1785     def _real_extract(self, url):
1786         mobj = re.match(self._VALID_URL, url)
1787         if mobj is None:
1788             raise ExtractorError(u'Invalid URL: %s' % url)
1789         url_title = mobj.group('url_title')
1790         webpage = self._download_webpage(url, url_title)
1791
1792         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1793             webpage, u'video id')
1794
1795         self.report_extraction(video_id)
1796
1797         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1798             webpage, u'title')
1799
1800         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1801             webpage, u'thumbnail', fatal=False)
1802
1803         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1804             webpage, u'description', fatal=False)
1805
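        # the file URLs are published in a separate CVP XML document keyed by the video id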
1806         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1807         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1808
1809         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1810             data, u'video URL')
1811
1812         return [{
1813             'id':          video_id,
1814             'url':         video_url,
1815             'ext':         'mp4',
1816             'title':       video_title,
1817             'thumbnail':   thumbnail,
1818             'description': video_description,
1819         }]
1820
1821 class XHamsterIE(InfoExtractor):
1822     """Information Extractor for xHamster"""
1823     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1824
1825     def _real_extract(self, url):
1826         mobj = re.match(self._VALID_URL, url)
1827
1828         video_id = mobj.group('id')
1829         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1830         webpage = self._download_webpage(mrss_url, video_id)
1831
1832         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1833         if mobj is None:
1834             raise ExtractorError(u'Unable to extract media URL')
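        # an empty 'srv' value means 'file' already holds the complete (percent-encoded) URL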
1835         if len(mobj.group('server')) == 0:
1836             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1837         else:
1838             video_url = mobj.group('server')+'/key='+mobj.group('file')
1839         video_extension = video_url.split('.')[-1]
1840
1841         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1842             webpage, u'title')
1843
1844         # Can't see the description anywhere in the UI
1845         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1846         #     webpage, u'description', fatal=False)
1847         # if video_description: video_description = unescapeHTML(video_description)
1848
1849         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1850         if mobj:
1851             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1852         else:
1853             video_upload_date = None
1854             self._downloader.report_warning(u'Unable to extract upload date')
1855
1856         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1857             webpage, u'uploader id', default=u'anonymous')
1858
1859         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1860             webpage, u'thumbnail', fatal=False)
1861
1862         return [{
1863             'id':       video_id,
1864             'url':      video_url,
1865             'ext':      video_extension,
1866             'title':    video_title,
1867             # 'description': video_description,
1868             'upload_date': video_upload_date,
1869             'uploader_id': video_uploader_id,
1870             'thumbnail': video_thumbnail
1871         }]
1872
1873 class HypemIE(InfoExtractor):
1874     """Information Extractor for hypem"""
1875     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1876
1877     def _real_extract(self, url):
1878         mobj = re.match(self._VALID_URL, url)
1879         if mobj is None:
1880             raise ExtractorError(u'Invalid URL: %s' % url)
1881         track_id = mobj.group(1)
1882
1883         data = { 'ax': 1, 'ts': time.time() }
1884         data_encoded = compat_urllib_parse.urlencode(data)
1885         complete_url = url + "?" + data_encoded
1886         request = compat_urllib_request.Request(complete_url)
1887         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
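        # keep the session cookie; it has to be sent along with the serve request below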
1888         cookie = urlh.headers.get('Set-Cookie', '')
1889
1890         self.report_extraction(track_id)
1891
1892         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1893             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1894         try:
1895             track_list = json.loads(html_tracks)
1896             track = track_list[u'tracks'][0]
1897         except ValueError:
1898             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1899
1900         key = track[u"key"]
1901         track_id = track[u"id"]
1902         artist = track[u"artist"]
1903         title = track[u"song"]
1904
1905         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1906         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1907         request.add_header('cookie', cookie)
1908         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1909         try:
1910             song_data = json.loads(song_data_json)
1911         except ValueError:
1912             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1913         final_url = song_data[u"url"]
1914
1915         return [{
1916             'id':       track_id,
1917             'url':      final_url,
1918             'ext':      "mp3",
1919             'title':    title,
1920             'artist':   artist,
1921         }]
1922
1923 class Vbox7IE(InfoExtractor):
1924     """Information Extractor for Vbox7"""
1925     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1926
1927     def _real_extract(self, url):
1928         mobj = re.match(self._VALID_URL, url)
1929         if mobj is None:
1930             raise ExtractorError(u'Invalid URL: %s' % url)
1931         video_id = mobj.group(1)
1932
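        # the page redirects via JavaScript (window.location); extract the target and follow it manually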
1933         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1934         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1935         redirect_url = urlh.geturl() + new_location
1936         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1937
1938         title = self._html_search_regex(r'<title>(.*)</title>',
1939             webpage, u'title').split('/')[0].strip()
1940
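        # the player info endpoint is POSTed the video id and replies with a
        # urlencoded string holding the media and thumbnail URLs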
1941         ext = "flv"
1942         info_url = "http://vbox7.com/play/magare.do"
1943         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1944         info_request = compat_urllib_request.Request(info_url, data)
1945         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1946         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1947         if info_response is None:
1948             raise ExtractorError(u'Unable to extract the media url')
1949         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1950
1951         return [{
1952             'id':        video_id,
1953             'url':       final_url,
1954             'ext':       ext,
1955             'title':     title,
1956             'thumbnail': thumbnail_url,
1957         }]
1958
1959
1960 def gen_extractors():
1961     """ Return a list of an instance of every supported extractor.
1962     The order does matter; the first extractor matched is the one handling the URL.
1963     """
1964     return [
1965         YoutubePlaylistIE(),
1966         YoutubeChannelIE(),
1967         YoutubeUserIE(),
1968         YoutubeSearchIE(),
1969         YoutubeIE(),
1970         MetacafeIE(),
1971         DailymotionIE(),
1972         GoogleSearchIE(),
1973         PhotobucketIE(),
1974         YahooIE(),
1975         YahooSearchIE(),
1976         DepositFilesIE(),
1977         FacebookIE(),
1978         BlipTVIE(),
1979         BlipTVUserIE(),
1980         VimeoIE(),
1981         MyVideoIE(),
1982         ComedyCentralIE(),
1983         EscapistIE(),
1984         CollegeHumorIE(),
1985         XVideosIE(),
1986         SoundcloudSetIE(),
1987         SoundcloudIE(),
1988         InfoQIE(),
1989         MixcloudIE(),
1990         StanfordOpenClassroomIE(),
1991         MTVIE(),
1992         YoukuIE(),
1993         XNXXIE(),
1994         YouJizzIE(),
1995         PornotubeIE(),
1996         YouPornIE(),
1997         GooglePlusIE(),
1998         ArteTvIE(),
1999         NBAIE(),
2000         WorldStarHipHopIE(),
2001         JustinTVIE(),
2002         FunnyOrDieIE(),
2003         SteamIE(),
2004         UstreamIE(),
2005         RBMARadioIE(),
2006         EightTracksIE(),
2007         KeekIE(),
2008         TEDIE(),
2009         MySpassIE(),
2010         SpiegelIE(),
2011         LiveLeakIE(),
2012         ARDIE(),
2013         ZDFIE(),
2014         TumblrIE(),
2015         BandcampIE(),
2016         RedTubeIE(),
2017         InaIE(),
2018         HowcastIE(),
2019         VineIE(),
2020         FlickrIE(),
2021         TeamcocoIE(),
2022         XHamsterIE(),
2023         HypemIE(),
2024         Vbox7IE(),
2025         GametrailersIE(),
2026         StatigramIE(),
2027         GenericIE()
2028     ]
2029
2030 def get_info_extractor(ie_name):
2031     """Returns the info extractor class with the given ie_name"""
2032     return globals()[ie_name+'IE']