fcc94db2ce7196fc0a0a7323d6991fba0920e988
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
25
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Returns a 'playlist' info dict whose entries are 'url'-type
        references, paging through Google video search (10 hits per page)
        until n results are collected or no further pages exist.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)
                # Stop as soon as we have the requested number of results.
                # The previous code only checked after a full page had been
                # appended, so it could return up to 9 extra entries.
                if len(res['entries']) >= n:
                    return res

            # No "next page" link: we have exhausted the available results.
            if not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
82
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo! screen JSON API (30 results per page)
        until n entries are collected or the API reports the last page.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(res['entries']) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # Skip result fragments that do not contain a video link
                # instead of crashing on mobj.group of None.
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Counting collected entries (rather than the inner loop index)
            # avoids a NameError when a page yields no results at all, and
            # fixes the off-by-one in the original `pagenum * 30 + i >= n`.
            if (len(res['entries']) >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
116
117
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of all videos uploaded by a blip.tv user.

        The mobile AJAX endpoint returns at most _PAGE_SIZE ids per page,
        so pages are fetched until a short (non-full) page is seen.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # Fail with a clear message instead of an AttributeError when the
        # numeric user id cannot be found on the page.
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id for %s' % username)
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the duplicate check; the original code
                # compared the raw id but stored the unescaped one, so
                # HTML-escaped duplicates slipped through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
176
177
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        # The file id is simply the last path component of the URL.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # POST the form data that corresponds to pressing 'Free download'.
        post_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Locate the real download URL in the returned page.
        download_mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if download_mobj is None or download_mobj.group(1) is None:
            # No URL found - try to surface the site's own error message.
            reason_mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if reason_mobj is not None and reason_mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', reason_mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = download_mobj.group(1)
        # Extension = suffix of the download URL, without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # The title lives in the tooltip of a bold element.
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
222
223
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optionally log in before extraction.  Credentials come from the
        # downloader's 'username'/'password' params or, when 'usenetrc' is
        # set, from the user's .netrc file.  All failures are non-fatal:
        # a warning is printed and extraction proceeds unauthenticated.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: stay anonymous.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract a single Facebook video from its permalink page.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array between these
        # two exact JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is a URL-quoted JSON blob containing the stream data.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
318
319
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Matches the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract a blip.tv video.

        Handles three URL shapes: api.swf references (rewritten to /play/
        URLs), /play/ URLs (resolved via their redirect and re-extracted
        recursively), and regular episode URLs (queried through the JSON
        "skin" API).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file reference; follow the redirect, build the episode
            # URL from the fragment's 'file' parameter, and recurse.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask blip.tv for the JSON description of the video.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data to iTunes clients; spoof it.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Reuse the response handle opened above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    # Turned into an ExtractorError by the handler below.
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
417
418
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Plain RC4: key scheduling, then the keystream XOR-ed over data.
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        # MD5 hex digest of s, as bytes (used as RC4 key material).
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract a myvideo.de video, handling both the plain embedded
        source and the RC4-encrypted player-XML variant."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site secret used to derive the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the flv source is embedded directly in the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Collect the player's flashvars; '_encxml' is the URL of the
        # encrypted XML, everything else becomes a query parameter.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64^-2(GK) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # BUGFIX: the original referenced an undefined name
            # 'video_filepath' here, raising a NameError for every f4m
            # video.  The base path travels in the same decrypted blob,
            # alongside the source attribute (cf. the non-rtmp branch).
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'video path')
            video_filepath = compat_urllib_parse.unquote(video_filepath)
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
567
568
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, in ascending order.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate (shown by _print_formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print a human-readable format/extension/resolution table.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        # Returns a list of info dicts, one per part of the episode.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand ":tds"-style abbreviations to the full-episodes URL and
        # re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode/clip title; dlNewest means "follow the
        # redirect to the newest full episode".
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The bare full-episodes URL redirects to the latest episode;
            # recover the concrete episode slug from the final URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Resolve the mgid URI to the episode's RSS index of parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per part of the episode; each needs its own config
        # request to enumerate the available renditions.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # turls: list of (bitrate, rtmp-url) pairs for this part.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The rtmp URLs are not downloadable directly; rewrite them to
            # the equivalent HTTP URLs on the mtvnmobile CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
735
736
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a single Escapist video from its view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(video_id)
        webpage = self._download_webpage(url, video_id)

        # Description and thumbnail are optional; extraction continues
        # without them.
        description = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        player_url = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # NOTE(review): the original reuses the u'player url' label for the
        # title lookup as well; kept for identical error messages.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'player url').split(' : ')[-1]

        # The player URL carries the (quoted) config URL in its query.
        config_url = self._search_regex('config=(.*)$', player_url, u'config url')
        config_url = compat_urllib_parse.unquote(config_url)

        config_json = self._download_webpage(config_url, video_id,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
796
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest, and
        assemble the final fragment (f4f) URL from both."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            # BUG FIX: a missing 'url' attribute raises KeyError, not
            # IndexError, so the handler must catch both to report a bad
            # manifest instead of crashing with an unhandled exception.
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
858
859
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Pull the FLV url, title and thumbnail out of an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The direct media link is stashed url-encoded in the flash vars.
        flv_url = self._search_regex(r'flv_url=(.+?)&', page, u'video URL')
        flv_url = compat_urllib_parse.unquote(flv_url)

        # Page title carries the video name followed by a site suffix.
        title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID', page, u'title')

        thumb = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }]
900
901
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves the track permalink through the public API to obtain
       track metadata, then queries the stream-definition endpoint for
       the playable mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug are part of the URL.
        uploader, slug_title = match.group(1), match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
958
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

       Resolves the set permalink through the public API, then fetches
       the stream definition of every track the set contains.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug are part of the URL.
        uploader, slug_title = match.group(1), match.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # Surface every API error, then bail out without results.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
1021
1022
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 RTMP reference embedded in an InfoQ page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is base64-encoded inside an inline script.
        encoded = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if encoded is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive the id and extension from the URL's final path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1065
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # NOTE(review): the .decode('utf-8') calls in _real_extract assume
    # Python 2 byte strings; on Python 3 str has no .decode and they would
    # raise AttributeError — presumably part of why this extractor is
    # disabled. Verify before re-enabling.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # jsonData[fmt] is either a {bitrate: url_list} mapping or a bare
        # url list when the format carries no bitrate information; the
        # TypeError branch handles the latter shape.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probing with a plain open is enough; the body is not read.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print each available format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a cloudcast URL to a playable audio file via the
        mixcloud JSON API, honouring --format and --list-formats."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format in turn until one of its URLs responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
1170
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a specific video, a course page, or
        the site root. Course and root pages recurse into each linked
        entry via self.extract and concatenate the results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # BUG FIX: raw string — '\?' in a plain literal is an invalid
            # escape sequence (DeprecationWarning on modern Python); the
            # pattern value itself is unchanged.
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # BUG FIX: raw string, same invalid-escape issue as the
            # VideoPage pattern in the course branch.
            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1266
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a video by reading the page's meta tags and the
        mediaGen rendition XML, always picking the last (highest
        quality) rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # BUG FIX: 'performer' was referenced in the result dict below but
        # never assigned, causing a NameError on every extraction; bind it
        # to the extracted mtv_an value (the artist name).
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1327
1328
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos as numbered segments; each segment's file id is
    derived from a seeded shuffle of a fixed character set (mirroring
    the site's player logic).
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id from the clock and two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character set shuffled by a linear-congruential
        generator seeded with *seed* (must match the site's player)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step, then draw (and remove) one character per round so
            # the result is a permutation of the source set.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id
        using the seeded character shuffle."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto youku's stream names: hd2 (if
            # offered) for best, mp4 for worst, flv otherwise.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment; each authorizes that segment's download URL.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1421
1422
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the FLV url, title and thumbnail from an xnxx page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        # The FLV location is url-encoded inside the page's flash vars.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, page, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')

        thumbnail = self._search_regex(self.VIDEO_THUMB_RE, page,
            u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
1461
1462
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract a video from a Google+ post: read metadata from the
        post page, then follow the photo-viewer page for media links."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = match.group(0)
        video_id = match.group(1)

        video_extension = 'flv'

        # Step 1: the post page holds date, author and title.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Normalize the timestamp into YYYYMMDD for filenames.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # The description meta tag's first line doubles as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: follow the photo-viewer page behind the image box.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect every (resolution, url) pair the player offers.
        found = re.findall('\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(found) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # After sorting, the last entry has the highest resolution; keep
        # only its URL component.
        video_url = sorted(found)[-1][-1]

        # Undo \uXXXX escaping embedded in the page source.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
1536
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The CDN URL is built directly from the path captured in the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Use only the last path segment as the reported id.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1570
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes are recognized: a bare channel name, an archived
    # broadcast (/b/<videoid>) and a chapter of a broadcast (/c/<chapterid>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100  # items requested per API page
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one JSON API page; return (raw_item_count, info_dicts).

        Clips with an empty 'video_file_url' are skipped, so the returned
        list may be shorter than the raw count (which callers use to detect
        the last page).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict carrying an 'error' key
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' starts with YYYY-MM-DD; drop the dashes
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on the matched URL type and return the info dict(s)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archives API in the loop below
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter of a broadcast: resolved fully inside this branch
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element matching archive_id; the for/else
            # raises when no element matched (loop ended without break).
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Page through the API; a page shorter than `limit` (or a non-paged
        # request) ends the loop.
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1703
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1732
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL embeds whitespace and comments, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated titles must be re-requested through the agecheck URL.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Movie URLs, display titles and thumbnails appear in the same order
        # on the page, so the three match iterators are consumed in lockstep.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        matched = zip(re.finditer(urlRE, webpage),
                      re.finditer(namesRE, webpage),
                      re.finditer(thumbsRE, webpage))

        videos = []
        for movie, name, thumb in matched:
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                raise ExtractorError(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
1787
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV can be fetched straight from the CDN using only the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: unlike most extractors in this file, this returns a bare dict.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1819
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container based on the stream URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1859
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded in the page as a JSON assignment.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps variant from the Akamai URL; the extension
        # comes from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1893
1894
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of `formats` whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Pre-set the age-gate cookie so the real page is served.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # narrowed from a bare `except:` -- only a JSON decode error is expected
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # str() is required here: concatenating the exception object
            # itself to a string would raise TypeError.
            raise ExtractorError('Missing JSON parameter: ' + str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path segment encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this previously tested the undefined name `result`,
            # raising NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1999
2000
2001
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is carried in the URL itself.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The player config embeds a percent-encoded FLV URL.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date (digits and slashes), normalized when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
2036
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The title lives on the public page ...
        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # ... while the stream itself is referenced from a separate embed page.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
2077
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play/next API endpoints require a (random) session id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        track_index = 0
        # Walk the play/next chain until the server flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return entries
2118
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived from the video id alone.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2146
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is mandatory here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL actually matched.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The two finditer streams are consumed in lockstep below; they are
        # assumed to appear in the same order on the page.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is handed back to the TED extractor via its own URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: `info` is rebound here from the parsed JSON to the result dict.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2221
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de.

    The video id is the last (or second-to-last, when a trailing slash is
    present) path element of the URL; all metadata comes from the site's
    XML metadata service.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read "format = ext", but no
            # name 'ext' exists in this scope, so a missing <format_id>
            # raised NameError. Fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2275
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos.

    The stream metadata is published as a per-video XML document; the
    last format node it lists is the one used here.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the final child node of the document root.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2307
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com view pages."""

    # BUG FIX: the pattern previously read '(?:http?://)?'. In that form
    # the '?' only makes the final 'p' optional ('htt'/'http'), so an
    # 'https://' URL could never match (nothing else in the pattern can
    # consume the scheme). 'https?://' accepts both schemes and, being in
    # an optional group, still matches scheme-less input as before.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config on the page exposes the direct file URL.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site-name prefix the og:title carries.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
2344
2345
2346
class TumblrIE(InfoExtractor):
    """Extractor for video posts hosted on *.tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always work from the canonical /post/ URL.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }
        return [info]
2380
class BandcampIE(InfoExtractor):
    """Extractor for Bandcamp tracks that offer a free download."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # Only tracks with a free download page are supported.
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # The track dictionary is embedded in a javascript literal.
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # The url given there is already expired, so rebuild the request url
        # the way Bandcamp's download_bunde_*.js script does.
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; "retry_url" is used instead.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': video_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }]
2426
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 source is exposed directly in an HTML5 <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        result = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }
        return [result]
2454         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata (including the mp4 location) is served as an MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2481
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Re-fetch via the canonical URL rebuilt from the id.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }
        return [info]
2515
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # All metadata is carried in <meta> tags on the page.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
2549
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        # Three round-trips: the photo page (for the secret), then two XML
        # documents that finally reveal the stream location.
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        # The og: meta tags all share the same shape; build each pattern
        # from one template (the resulting regexes are unchanged).
        og_regex = r'<meta property="og:%s" content=(?:"([^"]+)"|\'([^\']+)\')'
        video_title = self._html_search_regex(
            og_regex % 'title', webpage, u'video title')
        video_description = self._html_search_regex(
            og_regex % 'description', webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            og_regex % 'image', webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
2598
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is only present inside the page markup.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The stream location is served by a separate XML document.
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
2637
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # BUG FIX: the 'www' dot was previously unescaped ('(?:www.)?'), so the
    # '.' matched any character (e.g. 'wwwZxhamster.com'). Escaped to match
    # the literal dot, consistent with the other extractors in this file.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries a server/file pair; an empty server
        # means 'file' is already a full (percent-encoded) URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2689
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The site expects an ax/ts query pair and answers with a session
        # cookie that the /serve endpoint requires later on.
        data_encoded = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(url + "?" + data_encoded)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request resolves the track key to the actual media URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id':       track_id,
            'url':      song_data[u"url"],
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
2739
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page issues a javascript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(
            urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>', webpage, u'title')
        title = title.split('/')[0].strip()

        # The media URL is obtained from a form-encoded POST endpoint whose
        # response is a pair of '&'-separated key=value fields.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = (field.split('=')[1] for field in info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
2775
2776
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    Keep GenericIE last: it is the catch-all fallback, so any extractor
    listed after it would never get a chance to match.
    """
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        GenericIE()
    ]
2846
def get_info_extractor(ie_name):
    """Return the info extractor class whose name is ``<ie_name>IE``.

    Looks the class up in this module's namespace; raises KeyError when
    no extractor with that name exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]