Move Escapist into its own file
youtube_dl/InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.depositfiles import DepositFilesIE
27 from .extractor.escapist import EscapistIE
28 from .extractor.facebook import FacebookIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.googleplus import GooglePlusIE
32 from .extractor.googlesearch import GoogleSearchIE
33 from .extractor.metacafe import MetacafeIE
34 from .extractor.myvideo import MyVideoIE
35 from .extractor.statigram import StatigramIE
36 from .extractor.photobucket import PhotobucketIE
37 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
38 from .extractor.vimeo import VimeoIE
39 from .extractor.yahoo import YahooIE, YahooSearchIE
40 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
41 from .extractor.zdf import ZDFIE
42
43
71 class CollegeHumorIE(InfoExtractor):
72     """Information extractor for collegehumor.com"""
73
74     _WORKING = False
75     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
76     IE_NAME = u'collegehumor'
77
78     def report_manifest(self, video_id):
79         """Report information extraction."""
80         self.to_screen(u'%s: Downloading XML manifest' % video_id)
81
82     def _real_extract(self, url):
83         mobj = re.match(self._VALID_URL, url)
84         if mobj is None:
85             raise ExtractorError(u'Invalid URL: %s' % url)
86         video_id = mobj.group('videoid')
87
88         info = {
89             'id': video_id,
90             'uploader': None,
91             'upload_date': None,
92         }
93
94         self.report_extraction(video_id)
95         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
96         try:
97             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
98         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
99             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
100
101         mdoc = xml.etree.ElementTree.fromstring(metaXml)
102         try:
103             videoNode = mdoc.findall('./video')[0]
104             info['description'] = videoNode.findall('./description')[0].text
105             info['title'] = videoNode.findall('./caption')[0].text
106             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
107             manifest_url = videoNode.findall('./file')[0].text
108         except IndexError:
109             raise ExtractorError(u'Invalid metadata XML file')
110
111         manifest_url += '?hdcore=2.10.3'
112         self.report_manifest(video_id)
113         try:
114             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
115         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
116             raise ExtractorError(u'Unable to download video manifest: %s' % compat_str(err))
117
118         adoc = xml.etree.ElementTree.fromstring(manifestXml)
119         try:
120             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
121             node_id = media_node.attrib['url']
122             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
123         except IndexError as err:
124             raise ExtractorError(u'Invalid manifest file')
125
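        # Build the HDS fragment URL by hand: take the manifest's host, the
        # stream id from the <id> element and the media node's url attribute,
        # and request the first segment/fragment directly.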
126         url_pr = compat_urllib_parse_urlparse(manifest_url)
127         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
128
129         info['url'] = url
130         info['ext'] = 'f4f'
131         return [info]
132
133
134 class XVideosIE(InfoExtractor):
135     """Information extractor for xvideos.com"""
136
137     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
138     IE_NAME = u'xvideos'
139
140     def _real_extract(self, url):
141         mobj = re.match(self._VALID_URL, url)
142         if mobj is None:
143             raise ExtractorError(u'Invalid URL: %s' % url)
144         video_id = mobj.group(1)
145
146         webpage = self._download_webpage(url, video_id)
147
148         self.report_extraction(video_id)
149
150         # Extract video URL
151         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
152             webpage, u'video URL'))
153
154         # Extract title
155         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
156             webpage, u'title')
157
158         # Extract video thumbnail
159         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
160             webpage, u'thumbnail', fatal=False)
161
162         info = {
163             'id': video_id,
164             'url': video_url,
165             'uploader': None,
166             'upload_date': None,
167             'title': video_title,
168             'ext': 'flv',
169             'thumbnail': video_thumbnail,
170             'description': None,
171         }
172
173         return [info]
174
175
178 class InfoQIE(InfoExtractor):
179     """Information extractor for infoq.com"""
180     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
181
182     def _real_extract(self, url):
183         mobj = re.match(self._VALID_URL, url)
184         if mobj is None:
185             raise ExtractorError(u'Invalid URL: %s' % url)
186
187         webpage = self._download_webpage(url, video_id=url)
188         self.report_extraction(url)
189
190         # Extract video URL
191         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
192         if mobj is None:
193             raise ExtractorError(u'Unable to extract video url')
194         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
195         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
196
197         # Extract title
198         video_title = self._search_regex(r'contentTitle = "(.*?)";',
199             webpage, u'title')
200
201         # Extract description
202         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
203             webpage, u'description', fatal=False)
204
205         video_filename = video_url.split('/')[-1]
206         video_id, extension = video_filename.split('.')
207
208         info = {
209             'id': video_id,
210             'url': video_url,
211             'uploader': None,
212             'upload_date': None,
213             'title': video_title,
214             'ext': extension, # Extension is always(?) mp4, but seems to be flv
215             'thumbnail': None,
216             'description': video_description,
217         }
218
219         return [info]
220
221 class MixcloudIE(InfoExtractor):
222     """Information extractor for www.mixcloud.com"""
223
224     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
225     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
226     IE_NAME = u'mixcloud'
227
228     def report_download_json(self, file_id):
229         """Report JSON download."""
230         self.to_screen(u'Downloading json')
231
232     def get_urls(self, jsonData, fmt, bitrate='best'):
233         """Get urls from 'audio_formats' section in json"""
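        # jsonData[fmt] is either a {bitrate: [urls]} mapping or a flat list
        # of urls; when no (or an unknown) bitrate is requested, fall back to
        # the highest one available.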
234         file_url = None
235         try:
236             bitrate_list = jsonData[fmt]
237             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
238                 bitrate = max(bitrate_list) # select highest
239
240             url_list = jsonData[fmt][bitrate]
241         except TypeError: # we have no bitrate info.
242             url_list = jsonData[fmt]
243         return url_list
244
245     def check_urls(self, url_list):
246         """Returns 1st active url from list"""
247         for url in url_list:
248             try:
249                 compat_urllib_request.urlopen(url)
250                 return url
251             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
252                 url = None
253
254         return None
255
256     def _print_formats(self, formats):
257         print('Available formats:')
258         for fmt in formats.keys():
259             for b in formats[fmt]:
260                 try:
261                     ext = formats[fmt][b][0]
262                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
263                 except TypeError: # we have no bitrate info
264                     ext = formats[fmt][0]
265                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
266                     break
267
268     def _real_extract(self, url):
269         mobj = re.match(self._VALID_URL, url)
270         if mobj is None:
271             raise ExtractorError(u'Invalid URL: %s' % url)
272         # extract uploader & filename from url
273         uploader = mobj.group(1)
274         file_id = uploader + "-" + mobj.group(2)
275
276         # construct API request
277         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
278         # retrieve .json file with links to files
279         request = compat_urllib_request.Request(file_url)
280         try:
281             self.report_download_json(file_url)
282             jsonData = compat_urllib_request.urlopen(request).read()
283         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
284             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
285
286         # parse JSON (urlopen() returns bytes)
287         json_data = json.loads(jsonData.decode('utf-8'))
288         player_url = json_data['player_swf_url']
289         formats = dict(json_data['audio_formats'])
290
291         req_format = self._downloader.params.get('format', None)
292         bitrate = None
293
294         if self._downloader.params.get('listformats', None):
295             self._print_formats(formats)
296             return
297
298         if req_format is None or req_format == 'best':
299             for format_param in formats.keys():
300                 url_list = self.get_urls(formats, format_param)
301                 # check urls
302                 file_url = self.check_urls(url_list)
303                 if file_url is not None:
304                     break # got it!
305         else:
306             if req_format not in formats:
307                 raise ExtractorError(u'Format is not available')
308
309             url_list = self.get_urls(formats, req_format)
310             file_url = self.check_urls(url_list)
311             format_param = req_format
312
313         return [{
314             'id': file_id,
315             'url': file_url,
316             'uploader': uploader,
317             'upload_date': None,
318             'title': json_data['name'],
319             'ext': file_url.split('.')[-1],
320             'format': format_param if format_param is not None else u'NA',
321             'thumbnail': json_data['thumbnail_url'],
322             'description': json_data['description'],
323             'player_url': player_url,
324         }]
325
326 class StanfordOpenClassroomIE(InfoExtractor):
327     """Information extractor for Stanford's Open ClassRoom"""
328
329     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
330     IE_NAME = u'stanfordoc'
331
332     def _real_extract(self, url):
333         mobj = re.match(self._VALID_URL, url)
334         if mobj is None:
335             raise ExtractorError(u'Invalid URL: %s' % url)
336
337         if mobj.group('course') and mobj.group('video'): # A specific video
338             course = mobj.group('course')
339             video = mobj.group('video')
340             info = {
341                 'id': course + '_' + video,
342                 'uploader': None,
343                 'upload_date': None,
344             }
345
346             self.report_extraction(info['id'])
347             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
348             xmlUrl = baseUrl + video + '.xml'
349             try:
350                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
351             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
352                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
353             mdoc = xml.etree.ElementTree.fromstring(metaXml)
354             try:
355                 info['title'] = mdoc.findall('./title')[0].text
356                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
357             except IndexError:
358                 raise ExtractorError(u'Invalid metadata XML file')
359             info['ext'] = info['url'].rpartition('.')[2]
360             return [info]
361         elif mobj.group('course'): # A course page
362             course = mobj.group('course')
363             info = {
364                 'id': course,
365                 'type': 'playlist',
366                 'uploader': None,
367                 'upload_date': None,
368             }
369
370             coursepage = self._download_webpage(url, info['id'],
371                                         note='Downloading course info page',
372                                         errnote='Unable to download course info page')
373
374             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
375
376             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
377                 coursepage, u'description', fatal=False)
378
379             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
380             info['list'] = [
381                 {
382                     'type': 'reference',
383                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
384                 }
385                     for vpage in links]
386             results = []
387             for entry in info['list']:
388                 assert entry['type'] == 'reference'
389                 results += self.extract(entry['url'])
390             return results
391         else: # Root page
392             info = {
393                 'id': 'Stanford OpenClassroom',
394                 'type': 'playlist',
395                 'uploader': None,
396                 'upload_date': None,
397             }
398
399             self.report_download_webpage(info['id'])
400             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
401             try:
402                 rootpage = compat_urllib_request.urlopen(rootURL).read()
403             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
405
406             info['title'] = info['id']
407
408             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
409             info['list'] = [
410                 {
411                     'type': 'reference',
412                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
413                 }
414                     for cpage in links]
415
416             results = []
417             for entry in info['list']:
418                 assert entry['type'] == 'reference'
419                 results += self.extract(entry['url'])
420             return results
421
422 class MTVIE(InfoExtractor):
423     """Information extractor for MTV.com"""
424
425     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
426     IE_NAME = u'mtv'
427
428     def _real_extract(self, url):
429         mobj = re.match(self._VALID_URL, url)
430         if mobj is None:
431             raise ExtractorError(u'Invalid URL: %s' % url)
432         if not mobj.group('proto'):
433             url = 'http://' + url
434         video_id = mobj.group('videoid')
435
436         webpage = self._download_webpage(url, video_id)
437
438         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
439             webpage, u'performer', fatal=False)
440
441         video_title = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
442             webpage, u'title')
443
444         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
445             webpage, u'mtvn_uri', fatal=False)
446
447         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
448             webpage, u'content id', fatal=False)
449
450         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
451         self.report_extraction(video_id)
452         request = compat_urllib_request.Request(videogen_url)
453         try:
454             metadataXml = compat_urllib_request.urlopen(request).read()
455         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
456             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
457
458         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
459         renditions = mdoc.findall('.//rendition')
460
461         # For now, always pick the highest quality.
462         rendition = renditions[-1]
463
464         try:
465             _,_,ext = rendition.attrib['type'].partition('/')
466             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
467             video_url = rendition.find('./src').text
468         except KeyError:
469             raise ExtractorError('Invalid rendition field.')
470
471         info = {
472             'id': video_id,
473             'url': video_url,
474             'uploader': performer,
475             'upload_date': None,
476             'title': video_title,
477             'ext': ext,
478             'format': format,
479         }
480
481         return [info]
482
483
484 class YoukuIE(InfoExtractor):
485     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
486
487     def _gen_sid(self):
488         nowTime = int(time.time() * 1000)
489         random1 = random.randint(1000,1998)
490         random2 = random.randint(1000,9999)
491
492         return "%d%d%d" %(nowTime,random1,random2)
493
494     def _get_file_ID_mix_string(self, seed):
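        # Youku obfuscates its file ids: the page supplies a numeric seed, and
        # a small congruential generator (seed = (seed * 211 + 30031) % 65536)
        # repeatedly picks characters from this alphabet. Replaying the same
        # shuffle gives the lookup table that _get_file_id() uses to turn the
        # '*'-separated indices back into the real id.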
495         mixed = []
496         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
497         seed = float(seed)
498         for i in range(len(source)):
499             seed  =  (seed * 211 + 30031 ) % 65536
500             index  =  math.floor(seed / 65536 * len(source) )
501             mixed.append(source[int(index)])
502             source.remove(source[int(index)])
503         #return ''.join(mixed)
504         return mixed
505
506     def _get_file_id(self, fileId, seed):
507         mixed = self._get_file_ID_mix_string(seed)
508         ids = fileId.split('*')
509         realId = []
510         for ch in ids:
511             if ch:
512                 realId.append(mixed[int(ch)])
513         return ''.join(realId)
514
515     def _real_extract(self, url):
516         mobj = re.match(self._VALID_URL, url)
517         if mobj is None:
518             raise ExtractorError(u'Invalid URL: %s' % url)
519         video_id = mobj.group('ID')
520
521         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
522
523         jsondata = self._download_webpage(info_url, video_id)
524
525         self.report_extraction(video_id)
526         try:
527             config = json.loads(jsondata)
528
529             video_title =  config['data'][0]['title']
530             seed = config['data'][0]['seed']
531
532             format = self._downloader.params.get('format', None)
533             supported_format = list(config['data'][0]['streamfileids'].keys())
534
535             if format is None or format == 'best':
536                 if 'hd2' in supported_format:
537                     format = 'hd2'
538                 else:
539                     format = 'flv'
540                 ext = u'flv'
541             elif format == 'worst':
542                 format = 'mp4'
543                 ext = u'mp4'
544             else:
545                 format = 'flv'
546                 ext = u'flv'
547
548
549             fileid = config['data'][0]['streamfileids'][format]
550             keys = [s['k'] for s in config['data'][0]['segs'][format]]
551         except (UnicodeDecodeError, ValueError, KeyError):
552             raise ExtractorError(u'Unable to extract info section')
553
554         files_info=[]
555         sid = self._gen_sid()
556         fileid = self._get_file_id(fileid, seed)
557
558         #column 8,9 of fileid represent the segment number
559         #fileid[7:9] should be changed
560         for index, key in enumerate(keys):
561
562             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
563             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
564
565             info = {
566                 'id': '%s_part%02d' % (video_id, index),
567                 'url': download_url,
568                 'uploader': None,
569                 'upload_date': None,
570                 'title': video_title,
571                 'ext': ext,
572             }
573             files_info.append(info)
574
575         return files_info
576
577
578 class XNXXIE(InfoExtractor):
579     """Information extractor for xnxx.com"""
580
581     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
582     IE_NAME = u'xnxx'
583     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
584     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
585     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
586
587     def _real_extract(self, url):
588         mobj = re.match(self._VALID_URL, url)
589         if mobj is None:
590             raise ExtractorError(u'Invalid URL: %s' % url)
591         video_id = mobj.group(1)
592
593         # Get webpage content
594         webpage = self._download_webpage(url, video_id)
595
596         video_url = self._search_regex(self.VIDEO_URL_RE,
597             webpage, u'video URL')
598         video_url = compat_urllib_parse.unquote(video_url)
599
600         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
601             webpage, u'title')
602
603         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
604             webpage, u'thumbnail', fatal=False)
605
606         return [{
607             'id': video_id,
608             'url': video_url,
609             'uploader': None,
610             'upload_date': None,
611             'title': video_title,
612             'ext': 'flv',
613             'thumbnail': video_thumbnail,
614             'description': None,
615         }]
616
617
618
619 class NBAIE(InfoExtractor):
620     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
621     IE_NAME = u'nba'
622
623     def _real_extract(self, url):
624         mobj = re.match(self._VALID_URL, url)
625         if mobj is None:
626             raise ExtractorError(u'Invalid URL: %s' % url)
627
628         video_id = mobj.group(1)
629
630         webpage = self._download_webpage(url, video_id)
631
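        # The actual file lives on a predictable CDN path, so the 720p mp4 URL
        # can be built directly from the video path in the page URL.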
632         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
633
634         shortened_video_id = video_id.rpartition('/')[2]
635         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
636             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
637
638         # It isn't there in the HTML it returns to us
639         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
640
641         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
642
643         info = {
644             'id': shortened_video_id,
645             'url': video_url,
646             'ext': 'mp4',
647             'title': title,
648             # 'uploader_date': uploader_date,
649             'description': description,
650         }
651         return [info]
652
653 class JustinTVIE(InfoExtractor):
654     """Information extractor for justin.tv and twitch.tv"""
655     # TODO: One broadcast may be split into multiple videos. The key
656     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
657     # starts at 1 and increases. Can we treat all parts as one video?
658
659     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
660         (?:
661             (?P<channelid>[^/]+)|
662             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
663             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
664         )
665         /?(?:\#.*)?$
666         """
667     _JUSTIN_PAGE_LIMIT = 100
668     IE_NAME = u'justin.tv'
669
670     def report_download_page(self, channel, offset):
671         """Report attempt to download a single page of videos."""
672         self.to_screen(u'%s: Downloading video information from %d to %d' %
673                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
674
675     # Return count of items, list of *valid* items
676     def _parse_page(self, url, video_id):
677         webpage = self._download_webpage(url, video_id,
678                                          u'Downloading video info JSON',
679                                          u'unable to download video info JSON')
680
681         response = json.loads(webpage)
682         if type(response) != list:
683             error_text = response.get('error', 'unknown error')
684             raise ExtractorError(u'Justin.tv API: %s' % error_text)
685         info = []
686         for clip in response:
687             video_url = clip['video_file_url']
688             if video_url:
689                 video_extension = os.path.splitext(video_url)[1][1:]
690                 video_date = re.sub('-', '', clip['start_time'][:10])
691                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
692                 video_id = clip['id']
693                 video_title = clip.get('title', video_id)
694                 info.append({
695                     'id': video_id,
696                     'url': video_url,
697                     'title': video_title,
698                     'uploader': clip.get('channel_name', video_uploader_id),
699                     'uploader_id': video_uploader_id,
700                     'upload_date': video_date,
701                     'ext': video_extension,
702                 })
703         return (len(response), info)
704
705     def _real_extract(self, url):
706         mobj = re.match(self._VALID_URL, url)
707         if mobj is None:
708             raise ExtractorError(u'invalid URL: %s' % url)
709
710         api_base = 'http://api.justin.tv'
711         paged = False
712         if mobj.group('channelid'):
713             paged = True
714             video_id = mobj.group('channelid')
715             api = api_base + '/channel/archives/%s.json' % video_id
716         elif mobj.group('chapterid'):
717             chapter_id = mobj.group('chapterid')
718
719             webpage = self._download_webpage(url, chapter_id)
720             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
721             if not m:
722                 raise ExtractorError(u'Cannot find archive of a chapter')
723             archive_id = m.group(1)
724
725             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
726             chapter_info_xml = self._download_webpage(api, chapter_id,
727                                              note=u'Downloading chapter information',
728                                              errnote=u'Chapter information download failed')
729             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
730             for a in doc.findall('.//archive'):
731                 if archive_id == a.find('./id').text:
732                     break
733             else:
734                 raise ExtractorError(u'Could not find chapter in chapter information')
735
736             video_url = a.find('./video_file_url').text
737             video_ext = video_url.rpartition('.')[2] or u'flv'
738
739             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
740             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
741                                    note='Downloading chapter metadata',
742                                    errnote='Download of chapter metadata failed')
743             chapter_info = json.loads(chapter_info_json)
744
745             bracket_start = int(doc.find('.//bracket_start').text)
746             bracket_end = int(doc.find('.//bracket_end').text)
747
748             # TODO determine start (and probably fix up file)
749             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
750             #video_url += u'?start=' + TODO:start_timestamp
751             # bracket_start is 13290, but we want 51670615
752             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
753                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
754
755             info = {
756                 'id': u'c' + chapter_id,
757                 'url': video_url,
758                 'ext': video_ext,
759                 'title': chapter_info['title'],
760                 'thumbnail': chapter_info['preview'],
761                 'description': chapter_info['description'],
762                 'uploader': chapter_info['channel']['display_name'],
763                 'uploader_id': chapter_info['channel']['name'],
764             }
765             return [info]
766         else:
767             video_id = mobj.group('videoid')
768             api = api_base + '/broadcast/by_archive/%s.json' % video_id
769
770         self.report_extraction(video_id)
771
772         info = []
773         offset = 0
774         limit = self._JUSTIN_PAGE_LIMIT
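        # Channel archives are paged; keep fetching until the API returns
        # fewer than _JUSTIN_PAGE_LIMIT clips. Single videos need just one
        # request, so the loop exits after the first iteration.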
775         while True:
776             if paged:
777                 self.report_download_page(video_id, offset)
778             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
779             page_count, page_info = self._parse_page(page_url, video_id)
780             info.extend(page_info)
781             if not paged or page_count != limit:
782                 break
783             offset += limit
784         return info
785
786 class FunnyOrDieIE(InfoExtractor):
787     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
788
789     def _real_extract(self, url):
790         mobj = re.match(self._VALID_URL, url)
791         if mobj is None:
792             raise ExtractorError(u'invalid URL: %s' % url)
793
794         video_id = mobj.group('id')
795         webpage = self._download_webpage(url, video_id)
796
797         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
798             webpage, u'video URL', flags=re.DOTALL)
799
800         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
801             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
802
803         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
804             webpage, u'description', fatal=False, flags=re.DOTALL)
805
806         info = {
807             'id': video_id,
808             'url': video_url,
809             'ext': 'mp4',
810             'title': title,
811             'description': video_description,
812         }
813         return [info]
814
815 class SteamIE(InfoExtractor):
816     _VALID_URL = r"""http://store\.steampowered\.com/
817                 (agecheck/)?
818                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
819                 (?P<gameID>\d+)/?
820                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
821                 """
822     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
823     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
824
825     @classmethod
826     def suitable(cls, url):
827         """Receives a URL and returns True if suitable for this IE."""
828         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
829
830     def _real_extract(self, url):
831         m = re.match(self._VALID_URL, url, re.VERBOSE)
832         gameID = m.group('gameID')
833
834         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
835         webpage = self._download_webpage(videourl, gameID)
836
837         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
838             videourl = self._AGECHECK_TEMPLATE % gameID
839             self.report_age_confirmation()
840             webpage = self._download_webpage(videourl, gameID)
841
842         self.report_extraction(gameID)
843         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
844                                              webpage, 'game title')
845
846         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
847         mweb = re.finditer(urlRE, webpage)
848         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
849         titles = re.finditer(namesRE, webpage)
850         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
851         thumbs = re.finditer(thumbsRE, webpage)
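        # The movie_<id> JS blocks, <span class="title"> entries and thumbnail
        # <img> tags appear in the same order on the page, so zipping the
        # three iterators pairs each video with its title and thumbnail.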
852         videos = []
853         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
854             video_id = vid.group('videoID')
855             title = vtitle.group('videoName')
856             video_url = vid.group('videoURL')
857             video_thumb = thumb.group('thumbnail')
858             if not video_url:
859                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
860             info = {
861                 'id':video_id,
862                 'url':video_url,
863                 'ext': 'flv',
864                 'title': unescapeHTML(title),
865                 'thumbnail': video_thumb
866                   }
867             videos.append(info)
868         return [self.playlist_result(videos, gameID, game_title)]
869
870 class UstreamIE(InfoExtractor):
871     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
872     IE_NAME = u'ustream'
873
874     def _real_extract(self, url):
875         m = re.match(self._VALID_URL, url)
876         video_id = m.group('videoID')
877
878         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
879         webpage = self._download_webpage(url, video_id)
880
881         self.report_extraction(video_id)
882
883         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
884             webpage, u'title')
885
886         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
887             webpage, u'uploader', fatal=False, flags=re.DOTALL)
888
889         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
890             webpage, u'thumbnail', fatal=False)
891
892         info = {
893                 'id': video_id,
894                 'url': video_url,
895                 'ext': 'flv',
896                 'title': video_title,
897                 'uploader': uploader,
898                 'thumbnail': thumbnail,
899                }
900         return info
901
902 class WorldStarHipHopIE(InfoExtractor):
903     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
904     IE_NAME = u'WorldStarHipHop'
905
906     def _real_extract(self, url):
907         m = re.match(self._VALID_URL, url)
908         video_id = m.group('id')
909
910         webpage_src = self._download_webpage(url, video_id)
911
912         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
913             webpage_src, u'video URL')
914
915         if 'mp4' in video_url:
916             ext = 'mp4'
917         else:
918             ext = 'flv'
919
920         video_title = self._html_search_regex(r"<title>(.*)</title>",
921             webpage_src, u'title')
922
923         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
924         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
925             webpage_src, u'thumbnail', fatal=False)
926
927         if not thumbnail:
928             _title = r"""candytitles.*>(.*)</span>"""
929             mobj = re.search(_title, webpage_src)
930             if mobj is not None:
931                 video_title = mobj.group(1)
932
933         results = [{
934                     'id': video_id,
935                     'url' : video_url,
936                     'title' : video_title,
937                     'thumbnail' : thumbnail,
938                     'ext' : ext,
939                     }]
940         return results
941
942 class RBMARadioIE(InfoExtractor):
943     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
944
945     def _real_extract(self, url):
946         m = re.match(self._VALID_URL, url)
947         video_id = m.group('videoID')
948
949         webpage = self._download_webpage(url, video_id)
950
951         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
952             webpage, u'json data', flags=re.MULTILINE)
953
954         try:
955             data = json.loads(json_data)
956         except ValueError as e:
957             raise ExtractorError(u'Invalid JSON: ' + str(e))
958
959         video_url = data['akamai_url'] + '&cbr=256'
960         url_parts = compat_urllib_parse_urlparse(video_url)
961         video_ext = url_parts.path.rpartition('.')[2]
962         info = {
963                 'id': video_id,
964                 'url': video_url,
965                 'ext': video_ext,
966                 'title': data['title'],
967                 'description': data.get('teaser_text'),
968                 'location': data.get('country_of_origin'),
969                 'uploader': data.get('host', {}).get('name'),
970                 'uploader_id': data.get('host', {}).get('slug'),
971                 'thumbnail': data.get('image', {}).get('large_url_2x'),
972                 'duration': data.get('duration'),
973         }
974         return [info]
975
976
977 class YouPornIE(InfoExtractor):
978     """Information extractor for youporn.com."""
979     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
980
981     def _print_formats(self, formats):
982         """Print all available formats"""
983         print(u'Available formats:')
984         print(u'ext\t\tformat')
985         print(u'---------------------------------')
986         for format in formats:
987             print(u'%s\t\t%s'  % (format['ext'], format['format']))
988
989     def _specific(self, req_format, formats):
990         for x in formats:
991             if(x["format"]==req_format):
992                 return x
993         return None
994
995     def _real_extract(self, url):
996         mobj = re.match(self._VALID_URL, url)
997         if mobj is None:
998             raise ExtractorError(u'Invalid URL: %s' % url)
999         video_id = mobj.group('videoid')
1000
1001         req = compat_urllib_request.Request(url)
1002         req.add_header('Cookie', 'age_verified=1')
1003         webpage = self._download_webpage(req, video_id)
1004
1005         # Get JSON parameters
1006         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1007         try:
1008             params = json.loads(json_params)
1009         except ValueError:
1010             raise ExtractorError(u'Invalid JSON')
1011
1012         self.report_extraction(video_id)
1013         try:
1014             video_title = params['title']
1015             upload_date = unified_strdate(params['release_date_f'])
1016             video_description = params['description']
1017             video_uploader = params['submitted_by']
1018             thumbnail = params['thumbnails'][0]['image']
1019         except KeyError:
1020             raise ExtractorError(u'Missing JSON parameter: ' + compat_str(sys.exc_info()[1]))
1021
1022         # Get all of the formats available
1023         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1024         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1025             webpage, u'download list').strip()
1026
1027         # Get all of the links from the page
1028         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1029         links = re.findall(LINK_RE, download_list_html)
1030         if(len(links) == 0):
1031             raise ExtractorError(u'ERROR: no known formats available for video')
1032
1033         self.to_screen(u'Links found: %d' % len(links))
1034
1035         formats = []
1036         for link in links:
1037
1038             # A link looks like this:
1039             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1040             # A path looks like this:
1041             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
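            # The 5th path component ("480p_370k_8004515" above) encodes the
            # resolution and bitrate; its first two fields become the format
            # label.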
1042             video_url = unescapeHTML( link )
1043             path = compat_urllib_parse_urlparse( video_url ).path
1044             extension = os.path.splitext( path )[1][1:]
1045             format = path.split('/')[4].split('_')[:2]
1046             size = format[0]
1047             bitrate = format[1]
1048             format = "-".join( format )
1049             # title = u'%s-%s-%s' % (video_title, size, bitrate)
1050
1051             formats.append({
1052                 'id': video_id,
1053                 'url': video_url,
1054                 'uploader': video_uploader,
1055                 'upload_date': upload_date,
1056                 'title': video_title,
1057                 'ext': extension,
1058                 'format': format,
1059                 'thumbnail': thumbnail,
1060                 'description': video_description
1061             })
1062
1063         if self._downloader.params.get('listformats', None):
1064             self._print_formats(formats)
1065             return
1066
1067         req_format = self._downloader.params.get('format', None)
1068         self.to_screen(u'Format: %s' % req_format)
1069
1070         if req_format is None or req_format == 'best':
1071             return [formats[0]]
1072         elif req_format == 'worst':
1073             return [formats[-1]]
1074         elif req_format in ('-1', 'all'):
1075             return formats
1076         else:
1077             format = self._specific( req_format, formats )
1078             if format is None:
1079                 raise ExtractorError(u'Requested format not available')
1080             return [format]
1081
1082
1083
1084 class PornotubeIE(InfoExtractor):
1085     """Information extractor for pornotube.com."""
1086     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1087
1088     def _real_extract(self, url):
1089         mobj = re.match(self._VALID_URL, url)
1090         if mobj is None:
1091             raise ExtractorError(u'Invalid URL: %s' % url)
1092
1093         video_id = mobj.group('videoid')
1094         video_title = mobj.group('title')
1095
1096         # Get webpage content
1097         webpage = self._download_webpage(url, video_id)
1098
1099         # Get the video URL
1100         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1101         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1102         video_url = compat_urllib_parse.unquote(video_url)
1103
1104         #Get the uploaded date
1105         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1106         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1107         if upload_date: upload_date = unified_strdate(upload_date)
1108
1109         info = {'id': video_id,
1110                 'url': video_url,
1111                 'uploader': None,
1112                 'upload_date': upload_date,
1113                 'title': video_title,
1114                 'ext': 'flv',
1115                 'format': 'flv'}
1116
1117         return [info]
1118
1119 class YouJizzIE(InfoExtractor):
1120     """Information extractor for youjizz.com."""
1121     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1122
1123     def _real_extract(self, url):
1124         mobj = re.match(self._VALID_URL, url)
1125         if mobj is None:
1126             raise ExtractorError(u'Invalid URL: %s' % url)
1127
1128         video_id = mobj.group('videoid')
1129
1130         # Get webpage content
1131         webpage = self._download_webpage(url, video_id)
1132
1133         # Get the video title
1134         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1135             webpage, u'title').strip()
1136
1137         # Get the embed page
1138         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1139         if result is None:
1140             raise ExtractorError(u'ERROR: unable to extract embed page')
1141
1142         embed_page_url = result.group(0).strip()
1143         video_id = result.group('videoid')
1144
1145         webpage = self._download_webpage(embed_page_url, video_id)
1146
1147         # Get the video URL
1148         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1149             webpage, u'video URL')
1150
1151         info = {'id': video_id,
1152                 'url': video_url,
1153                 'title': video_title,
1154                 'ext': 'flv',
1155                 'format': 'flv',
1156                 'player_url': embed_page_url}
1157
1158         return [info]
1159
1160 class EightTracksIE(InfoExtractor):
1161     IE_NAME = '8tracks'
1162     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1163
1164     def _real_extract(self, url):
1165         mobj = re.match(self._VALID_URL, url)
1166         if mobj is None:
1167             raise ExtractorError(u'Invalid URL: %s' % url)
1168         playlist_id = mobj.group('id')
1169
1170         webpage = self._download_webpage(url, playlist_id)
1171
1172         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1173         data = json.loads(json_like)
1174
1175         session = str(random.randint(0, 1000000000))
1176         mix_id = data['id']
1177         track_count = data['tracks_count']
1178         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1179         next_url = first_url
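        # 8tracks only hands out one track at a time: /play returns the first
        # track for this session and each subsequent /next?...&track_id=...
        # call returns the following one, until 'at_last_track' is set.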
1180         res = []
1181         for i in itertools.count():
1182             api_json = self._download_webpage(next_url, playlist_id,
1183                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1184                 errnote=u'Failed to download song information')
1185             api_data = json.loads(api_json)
1186             track_data = api_data[u'set']['track']
1187             info = {
1188                 'id': track_data['id'],
1189                 'url': track_data['track_file_stream_url'],
1190                 'title': track_data['performer'] + u' - ' + track_data['name'],
1191                 'raw_title': track_data['name'],
1192                 'uploader_id': data['user']['login'],
1193                 'ext': 'm4a',
1194             }
1195             res.append(info)
1196             if api_data['set']['at_last_track']:
1197                 break
1198             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
1199         return res
1200
1201 class KeekIE(InfoExtractor):
1202     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1203     IE_NAME = u'keek'
1204
1205     def _real_extract(self, url):
1206         m = re.match(self._VALID_URL, url)
1207         video_id = m.group('videoID')
1208
1209         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1210         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1211         webpage = self._download_webpage(url, video_id)
1212
1213         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1214             webpage, u'title')
1215
1216         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1217             webpage, u'uploader', fatal=False)
1218
1219         info = {
1220                 'id': video_id,
1221                 'url': video_url,
1222                 'ext': 'mp4',
1223                 'title': video_title,
1224                 'thumbnail': thumbnail,
1225                 'uploader': uploader
1226         }
1227         return [info]
1228
1229 class TEDIE(InfoExtractor):
1230     _VALID_URL=r'''http://www\.ted\.com/
1231                    (
1232                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1233                         |
1234                         ((?P<type_talk>talks)) # We have a simple talk
1235                    )
1236                    (/lang/(.*?))? # The url may contain the language
1237                    /(?P<name>\w+) # Here goes the name and then ".html"
1238                    '''
1239
1240     @classmethod
1241     def suitable(cls, url):
1242         """Receives a URL and returns True if suitable for this IE."""
1243         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1244
1245     def _real_extract(self, url):
1246         m=re.match(self._VALID_URL, url, re.VERBOSE)
1247         if m.group('type_talk'):
1248             return [self._talk_info(url)]
1249         else :
1250             playlist_id=m.group('playlist_id')
1251             name=m.group('name')
1252             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1253             return [self._playlist_videos_info(url,name,playlist_id)]
1254
1255     def _playlist_videos_info(self,url,name,playlist_id=0):
1256         '''Returns the videos of the playlist'''
1257         video_RE=r'''
1258                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1259                      ([.\s]*?)data-playlist_item_id="(\d+)"
1260                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1261                      '''
1262         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1263         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1264         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1265         m_names=re.finditer(video_name_RE,webpage)
1266
1267         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1268                                                  webpage, 'playlist title')
1269
1270         playlist_entries = []
1271         for m_video, m_name in zip(m_videos,m_names):
1272             video_id=m_video.group('video_id')
1273             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1274             playlist_entries.append(self.url_result(talk_url, 'TED'))
1275         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1276
1277     def _talk_info(self, url, video_id=0):
1278         """Return the video for the talk in the url"""
1279         m = re.match(self._VALID_URL, url,re.VERBOSE)
1280         video_name = m.group('name')
1281         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1282         self.report_extraction(video_name)
1283         # If the url includes the language we get the title translated
1284         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1285                                         webpage, 'title')
1286         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1287                                     webpage, 'json data')
1288         info = json.loads(json_data)
1289         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1290                                        webpage, 'description', flags = re.DOTALL)
1291         
1292         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1293                                        webpage, 'thumbnail')
1294         info = {
1295                 'id': info['id'],
1296                 'url': info['htmlStreams'][-1]['file'],
1297                 'ext': 'mp4',
1298                 'title': title,
1299                 'thumbnail': thumbnail,
1300                 'description': desc,
1301                 }
1302         return info
1303
1304 class MySpassIE(InfoExtractor):
1305     _VALID_URL = r'http://www.myspass.de/.*'
1306
1307     def _real_extract(self, url):
1308         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1309
1310         # video id is the last path element of the URL
1311         # usually there is a trailing slash, so also try the second but last
1312         url_path = compat_urllib_parse_urlparse(url).path
1313         url_parent_path, video_id = os.path.split(url_path)
1314         if not video_id:
1315             _, video_id = os.path.split(url_parent_path)
1316
1317         # get metadata
1318         metadata_url = META_DATA_URL_TEMPLATE % video_id
1319         metadata_text = self._download_webpage(metadata_url, video_id)
1320         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1321
1322         # extract values from metadata
1323         url_flv_el = metadata.find('url_flv')
1324         if url_flv_el is None:
1325             raise ExtractorError(u'Unable to extract download url')
1326         video_url = url_flv_el.text
1327         extension = os.path.splitext(video_url)[1][1:]
1328         title_el = metadata.find('title')
1329         if title_el is None:
1330             raise ExtractorError(u'Unable to extract title')
1331         title = title_el.text
1332         format_id_el = metadata.find('format_id')
1333         if format_id_el is None:
1334             format = extension
1335         else:
1336             format = format_id_el.text
1337         description_el = metadata.find('description')
1338         if description_el is not None:
1339             description = description_el.text
1340         else:
1341             description = None
1342         imagePreview_el = metadata.find('imagePreview')
1343         if imagePreview_el is not None:
1344             thumbnail = imagePreview_el.text
1345         else:
1346             thumbnail = None
1347         info = {
1348             'id': video_id,
1349             'url': video_url,
1350             'title': title,
1351             'ext': extension,
1352             'format': format,
1353             'thumbnail': thumbnail,
1354             'description': description
1355         }
1356         return [info]
1357
1358 class SpiegelIE(InfoExtractor):
1359     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1360
1361     def _real_extract(self, url):
1362         m = re.match(self._VALID_URL, url)
1363         video_id = m.group('videoID')
1364
1365         webpage = self._download_webpage(url, video_id)
1366
1367         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1368             webpage, u'title')
1369
1370         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1371         xml_code = self._download_webpage(xml_url, video_id,
1372                     note=u'Downloading XML', errnote=u'Failed to download XML')
1373
1374         idoc = xml.etree.ElementTree.fromstring(xml_code)
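             # the last child node of the XML is used below (assumed here to be
             # the preferred, highest-quality variant)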
1375         last_type = idoc[-1]
1376         filename = last_type.findall('./filename')[0].text
1377         duration = float(last_type.findall('./duration')[0].text)
1378
1379         video_url = 'http://video2.spiegel.de/flash/' + filename
1380         video_ext = filename.rpartition('.')[2]
1381         info = {
1382             'id': video_id,
1383             'url': video_url,
1384             'ext': video_ext,
1385             'title': video_title,
1386             'duration': duration,
1387         }
1388         return [info]
1389
1390 class LiveLeakIE(InfoExtractor):
1391
1392     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1393     IE_NAME = u'liveleak'
1394
1395     def _real_extract(self, url):
1396         mobj = re.match(self._VALID_URL, url)
1397         if mobj is None:
1398             raise ExtractorError(u'Invalid URL: %s' % url)
1399
1400         video_id = mobj.group('video_id')
1401
1402         webpage = self._download_webpage(url, video_id)
1403
1404         video_url = self._search_regex(r'file: "(.*?)",',
1405             webpage, u'video URL')
1406
1407         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1408             webpage, u'title').replace('LiveLeak.com -', '').strip()
1409
1410         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1411             webpage, u'description', fatal=False)
1412
1413         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1414             webpage, u'uploader', fatal=False)
1415
1416         info = {
1417             'id':  video_id,
1418             'url': video_url,
1419             'ext': 'mp4',
1420             'title': video_title,
1421             'description': video_description,
1422             'uploader': video_uploader
1423         }
1424
1425         return [info]
1426
1427
1428
1429 class TumblrIE(InfoExtractor):
1430     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1431
1432     def _real_extract(self, url):
1433         m_url = re.match(self._VALID_URL, url)
1434         video_id = m_url.group('id')
1435         blog = m_url.group('blog_name')
1436
1437         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1438         webpage = self._download_webpage(url, video_id)
1439
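             # the player markup is embedded in escaped javascript, so double quotes
             # appear literally as \x22 in the page source; the pattern below matches
             # that escaped form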
1440         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1441         video = re.search(re_video, webpage)
1442         if video is None:
1443             raise ExtractorError(u'Unable to extract video')
1444         video_url = video.group('video_url')
1445         ext = video.group('ext')
1446
1447         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1448             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1449         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1450
1451         # The only place where you can get a title, it's not complete,
1452         # but searching in other places doesn't work for all videos
1453         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1454             webpage, u'title', flags=re.DOTALL)
1455
1456         return [{'id': video_id,
1457                  'url': video_url,
1458                  'title': video_title,
1459                  'thumbnail': video_thumbnail,
1460                  'ext': ext
1461                  }]
1462
1463 class BandcampIE(InfoExtractor):
1464     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1465
1466     def _real_extract(self, url):
1467         mobj = re.match(self._VALID_URL, url)
1468         title = mobj.group('title')
1469         webpage = self._download_webpage(url, title)
1470         # We get the link to the free download page
1471         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1472         if m_download is None:
1473             raise ExtractorError(u'No free songs found')
1474
1475         download_link = m_download.group(1)
1476         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1477                        webpage, re.MULTILINE|re.DOTALL).group('id')
1478
1479         download_webpage = self._download_webpage(download_link, id,
1480                                                   'Downloading free download page')
1481         # We get the dictionary of the track from some javascript code
1482         info = re.search(r'items: (.*?),$',
1483                          download_webpage, re.MULTILINE).group(1)
1484         info = json.loads(info)[0]
1485         # We pick mp3-320 for now, until format selection can be easily implemented.
1486         mp3_info = info[u'downloads'][u'mp3-320']
1487         # If we try to use this url it says the link has expired
1488         initial_url = mp3_info[u'url']
1489         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1490         m_url = re.match(re_url, initial_url)
1491         # We build the url we will use to get the final track url
1492         # This url is built by the Bandcamp script download_bunde_*.js
1493         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1494         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1495         # If we could correctly generate the .rand field, the url would be
1496         # in the "download_url" key
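             # since we cannot, the "retry_url" field of the response is used instead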
1497         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1498
1499         track_info = {'id':id,
1500                       'title' : info[u'title'],
1501                       'ext' :   'mp3',
1502                       'url' :   final_url,
1503                       'thumbnail' : info[u'thumb_url'],
1504                       'uploader' :  info[u'artist']
1505                       }
1506
1507         return [track_info]
1508
1509 class RedTubeIE(InfoExtractor):
1510     """Information Extractor for redtube"""
1511     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1512
1513     def _real_extract(self,url):
1514         mobj = re.match(self._VALID_URL, url)
1515         if mobj is None:
1516             raise ExtractorError(u'Invalid URL: %s' % url)
1517
1518         video_id = mobj.group('id')
1519         video_extension = 'mp4'
1520         webpage = self._download_webpage(url, video_id)
1521
1522         self.report_extraction(video_id)
1523
1524         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1525             webpage, u'video URL')
1526
1527         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1528             webpage, u'title')
1529
1530         return [{
1531             'id':       video_id,
1532             'url':      video_url,
1533             'ext':      video_extension,
1534             'title':    video_title,
1535         }]
1536
1537 class InaIE(InfoExtractor):
1538     """Information Extractor for Ina.fr"""
1539     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1540
1541     def _real_extract(self,url):
1542         mobj = re.match(self._VALID_URL, url)
1543
1544         video_id = mobj.group('id')
1545         mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1546         video_extension = 'mp4'
1547         webpage = self._download_webpage(mrss_url, video_id)
1548
1549         self.report_extraction(video_id)
1550
1551         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1552             webpage, u'video URL')
1553
1554         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1555             webpage, u'title')
1556
1557         return [{
1558             'id':       video_id,
1559             'url':      video_url,
1560             'ext':      video_extension,
1561             'title':    video_title,
1562         }]
1563
1564 class HowcastIE(InfoExtractor):
1565     """Information Extractor for Howcast.com"""
1566     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1567
1568     def _real_extract(self, url):
1569         mobj = re.match(self._VALID_URL, url)
1570
1571         video_id = mobj.group('id')
1572         webpage_url = 'http://www.howcast.com/videos/' + video_id
1573         webpage = self._download_webpage(webpage_url, video_id)
1574
1575         self.report_extraction(video_id)
1576
1577         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1578             webpage, u'video URL')
1579
1580         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1581             webpage, u'title')
1582
1583         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1584             webpage, u'description', fatal=False)
1585
1586         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1587             webpage, u'thumbnail', fatal=False)
1588
1589         return [{
1590             'id':       video_id,
1591             'url':      video_url,
1592             'ext':      'mp4',
1593             'title':    video_title,
1594             'description': video_description,
1595             'thumbnail': thumbnail,
1596         }]
1597
1598 class VineIE(InfoExtractor):
1599     """Information Extractor for Vine.co"""
1600     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1601
1602     def _real_extract(self, url):
1603         mobj = re.match(self._VALID_URL, url)
1604
1605         video_id = mobj.group('id')
1606         webpage_url = 'https://vine.co/v/' + video_id
1607         webpage = self._download_webpage(webpage_url, video_id)
1608
1609         self.report_extraction(video_id)
1610
1611         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1612             webpage, u'video URL')
1613
1614         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1615             webpage, u'title')
1616
1617         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1618             webpage, u'thumbnail', fatal=False)
1619
1620         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1621             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1622
1623         return [{
1624             'id':        video_id,
1625             'url':       video_url,
1626             'ext':       'mp4',
1627             'title':     video_title,
1628             'thumbnail': thumbnail,
1629             'uploader':  uploader,
1630         }]
1631
1632 class FlickrIE(InfoExtractor):
1633     """Information Extractor for Flickr videos"""
1634     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1635
1636     def _real_extract(self, url):
1637         mobj = re.match(self._VALID_URL, url)
1638
1639         video_id = mobj.group('id')
1640         video_uploader_id = mobj.group('uploader_id')
1641         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1642         webpage = self._download_webpage(webpage_url, video_id)
1643
1644         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1645
1646         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1647         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1648
1649         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1650             first_xml, u'node_id')
1651
1652         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1653         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1654
1655         self.report_extraction(video_id)
1656
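             # the playlist XML exposes the stream as <STREAM APP="..." FULLPATH="...">;
             # the final url is APP followed by the HTML-unescaped FULLPATH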
1657         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1658         if mobj is None:
1659             raise ExtractorError(u'Unable to extract video url')
1660         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1661
1662         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1663             webpage, u'video title')
1664
1665         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1666             webpage, u'description', fatal=False)
1667
1668         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1669             webpage, u'thumbnail', fatal=False)
1670
1671         return [{
1672             'id':          video_id,
1673             'url':         video_url,
1674             'ext':         'mp4',
1675             'title':       video_title,
1676             'description': video_description,
1677             'thumbnail':   thumbnail,
1678             'uploader_id': video_uploader_id,
1679         }]
1680
1681 class TeamcocoIE(InfoExtractor):
1682     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1683
1684     def _real_extract(self, url):
1685         mobj = re.match(self._VALID_URL, url)
1686         if mobj is None:
1687             raise ExtractorError(u'Invalid URL: %s' % url)
1688         url_title = mobj.group('url_title')
1689         webpage = self._download_webpage(url, url_title)
1690
1691         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1692             webpage, u'video id')
1693
1694         self.report_extraction(video_id)
1695
1696         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1697             webpage, u'title')
1698
1699         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1700             webpage, u'thumbnail', fatal=False)
1701
1702         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1703             webpage, u'description', fatal=False)
1704
1705         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1706         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1707
1708         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1709             data, u'video URL')
1710
1711         return [{
1712             'id':          video_id,
1713             'url':         video_url,
1714             'ext':         'mp4',
1715             'title':       video_title,
1716             'thumbnail':   thumbnail,
1717             'description': video_description,
1718         }]
1719
1720 class XHamsterIE(InfoExtractor):
1721     """Information Extractor for xHamster"""
1722     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1723
1724     def _real_extract(self,url):
1725         mobj = re.match(self._VALID_URL, url)
1726
1727         video_id = mobj.group('id')
1728         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1729         webpage = self._download_webpage(mrss_url, video_id)
1730
1731         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1732         if mobj is None:
1733             raise ExtractorError(u'Unable to extract media URL')
1734         if len(mobj.group('server')) == 0:
1735             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1736         else:
1737             video_url = mobj.group('server')+'/key='+mobj.group('file')
1738         video_extension = video_url.split('.')[-1]
1739
1740         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1741             webpage, u'title')
1742
1743         # Can't see the description anywhere in the UI
1744         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1745         #     webpage, u'description', fatal=False)
1746         # if video_description: video_description = unescapeHTML(video_description)
1747
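             # the upload date appears in a hint='YYYY-MM-DD HH:MM:SS TZ' attribute
             # and is reassembled below as YYYYMMDD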
1748         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1749         if mobj:
1750             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1751         else:
1752             video_upload_date = None
1753             self._downloader.report_warning(u'Unable to extract upload date')
1754
1755         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1756             webpage, u'uploader id', default=u'anonymous')
1757
1758         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1759             webpage, u'thumbnail', fatal=False)
1760
1761         return [{
1762             'id':       video_id,
1763             'url':      video_url,
1764             'ext':      video_extension,
1765             'title':    video_title,
1766             # 'description': video_description,
1767             'upload_date': video_upload_date,
1768             'uploader_id': video_uploader_id,
1769             'thumbnail': video_thumbnail
1770         }]
1771
1772 class HypemIE(InfoExtractor):
1773     """Information Extractor for hypem"""
1774     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1775
1776     def _real_extract(self, url):
1777         mobj = re.match(self._VALID_URL, url)
1778         if mobj is None:
1779             raise ExtractorError(u'Invalid URL: %s' % url)
1780         track_id = mobj.group(1)
1781
1782         data = { 'ax': 1, 'ts': time.time() }
1783         data_encoded = compat_urllib_parse.urlencode(data)
1784         complete_url = url + "?" + data_encoded
1785         request = compat_urllib_request.Request(complete_url)
1786         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1787         cookie = urlh.headers.get('Set-Cookie', '')
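             # keep the cookie set by this response; it is sent along with the
             # /serve/source request below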
1788
1789         self.report_extraction(track_id)
1790
1791         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1792             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1793         try:
1794             track_list = json.loads(html_tracks)
1795             track = track_list[u'tracks'][0]
1796         except ValueError:
1797             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1798
1799         key = track[u"key"]
1800         track_id = track[u"id"]
1801         artist = track[u"artist"]
1802         title = track[u"song"]
1803
1804         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1805         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1806         request.add_header('cookie', cookie)
1807         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1808         try:
1809             song_data = json.loads(song_data_json)
1810         except ValueError:
1811             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1812         final_url = song_data[u"url"]
1813
1814         return [{
1815             'id':       track_id,
1816             'url':      final_url,
1817             'ext':      "mp3",
1818             'title':    title,
1819             'artist':   artist,
1820         }]
1821
1822 class Vbox7IE(InfoExtractor):
1823     """Information Extractor for Vbox7"""
1824     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1825
1826     def _real_extract(self,url):
1827         mobj = re.match(self._VALID_URL, url)
1828         if mobj is None:
1829             raise ExtractorError(u'Invalid URL: %s' % url)
1830         video_id = mobj.group(1)
1831
1832         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1833         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1834         redirect_url = urlh.geturl() + new_location
1835         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1836
1837         title = self._html_search_regex(r'<title>(.*)</title>',
1838             webpage, u'title').split('/')[0].strip()
1839
1840         ext = "flv"
1841         info_url = "http://vbox7.com/play/magare.do"
1842         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1843         info_request = compat_urllib_request.Request(info_url, data)
1844         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1845         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1846         if info_response is None:
1847             raise ExtractorError(u'Unable to extract the media url')
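             # the response body consists of two '&'-separated name=value pairs;
             # their values are taken as the media url and the thumbnail url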
1848         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1849
1850         return [{
1851             'id':        video_id,
1852             'url':       final_url,
1853             'ext':       ext,
1854             'title':     title,
1855             'thumbnail': thumbnail_url,
1856         }]
1857
1858
1859 def gen_extractors():
1860     """ Return a list with an instance of every supported extractor.
1861     The order does matter; the first extractor matched is the one handling the URL.
1862     """
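         # GenericIE is kept last on purpose: it is the catch-all fallback, tried
         # only after all of the more specific extractors above have failed to match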
1863     return [
1864         YoutubePlaylistIE(),
1865         YoutubeChannelIE(),
1866         YoutubeUserIE(),
1867         YoutubeSearchIE(),
1868         YoutubeIE(),
1869         MetacafeIE(),
1870         DailymotionIE(),
1871         GoogleSearchIE(),
1872         PhotobucketIE(),
1873         YahooIE(),
1874         YahooSearchIE(),
1875         DepositFilesIE(),
1876         FacebookIE(),
1877         BlipTVIE(),
1878         BlipTVUserIE(),
1879         VimeoIE(),
1880         MyVideoIE(),
1881         ComedyCentralIE(),
1882         EscapistIE(),
1883         CollegeHumorIE(),
1884         XVideosIE(),
1885         SoundcloudSetIE(),
1886         SoundcloudIE(),
1887         InfoQIE(),
1888         MixcloudIE(),
1889         StanfordOpenClassroomIE(),
1890         MTVIE(),
1891         YoukuIE(),
1892         XNXXIE(),
1893         YouJizzIE(),
1894         PornotubeIE(),
1895         YouPornIE(),
1896         GooglePlusIE(),
1897         ArteTvIE(),
1898         NBAIE(),
1899         WorldStarHipHopIE(),
1900         JustinTVIE(),
1901         FunnyOrDieIE(),
1902         SteamIE(),
1903         UstreamIE(),
1904         RBMARadioIE(),
1905         EightTracksIE(),
1906         KeekIE(),
1907         TEDIE(),
1908         MySpassIE(),
1909         SpiegelIE(),
1910         LiveLeakIE(),
1911         ARDIE(),
1912         ZDFIE(),
1913         TumblrIE(),
1914         BandcampIE(),
1915         RedTubeIE(),
1916         InaIE(),
1917         HowcastIE(),
1918         VineIE(),
1919         FlickrIE(),
1920         TeamcocoIE(),
1921         XHamsterIE(),
1922         HypemIE(),
1923         Vbox7IE(),
1924         GametrailersIE(),
1925         StatigramIE(),
1926         GenericIE()
1927     ]
1928
1929 def get_info_extractor(ie_name):
1930     """Returns the info extractor class with the given ie_name"""
1931     return globals()[ie_name+'IE']