]> git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py
7c9f09f77b6d093e62e91735cf8ee8bb01db7408
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Flipped to True by initialize() after _real_initialize() has run.
    _ready = False
    # FileDownloader instance attached via set_downloader().
    _downloader = None
    # Subclasses set this to False to mark a broken extractor.
    _WORKING = True
71
72     def __init__(self, downloader=None):
73         """Constructor. Receives an optional downloader."""
74         self._ready = False
75         self.set_downloader(downloader)
76
77     @classmethod
78     def suitable(cls, url):
79         """Receives a URL and returns True if suitable for this IE."""
80         return re.match(cls._VALID_URL, url) is not None
81
82     @classmethod
83     def working(cls):
84         """Getter method for _WORKING."""
85         return cls._WORKING
86
87     def initialize(self):
88         """Initializes an instance (authentication, etc)."""
89         if not self._ready:
90             self._real_initialize()
91             self._ready = True
92
93     def extract(self, url):
94         """Extracts URL information and returns it in list of dicts."""
95         self.initialize()
96         return self._real_extract(url)
97
98     def set_downloader(self, downloader):
99         """Sets the downloader for this IE."""
100         self._downloader = downloader
101
102     def _real_initialize(self):
103         """Real initialization process. Redefine in subclasses."""
104         pass
105
106     def _real_extract(self, url):
107         """Real extraction process. Redefine in subclasses."""
108         pass
109
110     @property
111     def IE_NAME(self):
112         return type(self).__name__[:-2]
113
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # Announce the download unless the caller passed note=False to
        # suppress output entirely.
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported at the top of this file;
            # this relies on it leaking through `from .utils import *` --
            # confirm, or add an explicit `import sys`.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
126
127     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128         """ Returns the data of the page as a string """
129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130         content_type = urlh.headers.get('Content-Type', '')
131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
132         if m:
133             encoding = m.group(1)
134         else:
135             encoding = 'utf-8'
136         webpage_bytes = urlh.read()
137         if self._downloader.params.get('dump_intermediate_pages', False):
138             try:
139                 url = url_or_request.get_full_url()
140             except AttributeError:
141                 url = url_or_request
142             self.to_screen(u'Dumping request to ' + url)
143             dump = base64.b64encode(webpage_bytes).decode('ascii')
144             self._downloader.to_screen(dump)
145         return webpage_bytes.decode(encoding, 'replace')
146
147     def to_screen(self, msg):
148         """Print msg to screen, prefixing it with '[ie_name]'"""
149         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
150
151     def report_extraction(self, id_or_name):
152         """Report information extraction."""
153         self.to_screen(u'%s: Extracting information' % id_or_name)
154
    # Methods for following issue #608.
    # They set the correct value of the '_type' key in the info dictionary.
157     def video_result(self, video_info):
158         """Returns a video"""
159         video_info['_type'] = 'video'
160         return video_info
161     def url_result(self, url, ie=None):
162         """Returns a url that points to a page that should be processed"""
163         #TODO: ie should be the class used for getting the info
164         video_info = {'_type': 'url',
165                       'url': url,
166                       'ie_key': ie}
167         return video_info
168     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
169         """Returns a playlist"""
170         video_info = {'_type': 'playlist',
171                       'entries': entries}
172         if playlist_id:
173             video_info['id'] = playlist_id
174         if playlist_title:
175             video_info['title'] = playlist_title
176         return video_info
177
178
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp matching watch URLs, embed/v/e URLs, youtu.be short
    # links, youtube-nocookie.com and bare video IDs (group 2 is the ID).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces English pages so the scraping regexps below keep matching.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing is assumed 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display size; values appear to be height x width -- TODO confirm.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'  # overrides the InfoExtractor.IE_NAME property
238
239     @classmethod
240     def suitable(cls, url):
241         """Receives a URL and returns True if suitable for this IE."""
242         if YoutubePlaylistIE.suitable(url): return False
243         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
244
245     def report_lang(self):
246         """Report attempt to set language."""
247         self.to_screen(u'Setting language')
248
249     def report_login(self):
250         """Report attempt to log in."""
251         self.to_screen(u'Logging in')
252
253     def report_age_confirmation(self):
254         """Report attempt to confirm age."""
255         self.to_screen(u'Confirming age')
256
257     def report_video_webpage_download(self, video_id):
258         """Report attempt to download video webpage."""
259         self.to_screen(u'%s: Downloading video webpage' % video_id)
260
261     def report_video_info_webpage_download(self, video_id):
262         """Report attempt to download video info webpage."""
263         self.to_screen(u'%s: Downloading video info webpage' % video_id)
264
    def report_video_subtitles_download(self, video_id):
        """Report that available subtitles are being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
268
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
272
273     def report_video_subtitles_available(self, video_id, sub_lang_list):
274         """Report available subtitles."""
275         sub_lang = ",".join(list(sub_lang_list.keys()))
276         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
277
278     def report_information_extraction(self, video_id):
279         """Report attempt to extract video information."""
280         self.to_screen(u'%s: Extracting video information' % video_id)
281
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
285
286     def report_rtmp_download(self):
287         """Indicate the download will use the RTMP protocol."""
288         self.to_screen(u'RTMP download detected')
289
    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} for the video's caption tracks.

        On failure returns an (error_message, None) tuple instead; callers
        distinguish the two cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Scrape the name/lang_code attribute pairs from the track list XML.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list
302
303     def _list_available_subtitles(self, video_id):
304         sub_lang_list = self._get_available_subtitles(video_id)
305         self.report_video_subtitles_available(video_id, sub_lang_list)
306
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)

        Fetches one subtitle track from the timedtext API; on success
        error_message is None, on failure the other two fields are None.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            # An empty response body is treated as missing subtitles.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
327
328     def _extract_subtitle(self, video_id):
329         """
330         Return a list with a tuple:
331         [(error_message, sub_lang, sub)]
332         """
333         sub_lang_list = self._get_available_subtitles(video_id)
334         sub_format = self._downloader.params.get('subtitlesformat')
335         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
336             return [(sub_lang_list[0], None, None)]
337         if self._downloader.params.get('subtitleslang', False):
338             sub_lang = self._downloader.params.get('subtitleslang')
339         elif 'en' in sub_lang_list:
340             sub_lang = 'en'
341         else:
342             sub_lang = list(sub_lang_list.keys())[0]
343         if not sub_lang in sub_lang_list:
344             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
345
346         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
347         return [subtitle]
348
349     def _extract_all_subtitles(self, video_id):
350         sub_lang_list = self._get_available_subtitles(video_id)
351         sub_format = self._downloader.params.get('subtitlesformat')
352         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
353             return [(sub_lang_list[0], None, None)]
354         subtitles = []
355         for sub_lang in sub_lang_list:
356             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
357             subtitles.append(subtitle)
358         return subtitles
359
360     def _print_formats(self, formats):
361         print('Available formats:')
362         for x in formats:
363             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
364
365     def _real_initialize(self):
366         if self._downloader is None:
367             return
368
369         username = None
370         password = None
371         downloader_params = self._downloader.params
372
373         # Attempt to use provided username and password or .netrc data
374         if downloader_params.get('username', None) is not None:
375             username = downloader_params['username']
376             password = downloader_params['password']
377         elif downloader_params.get('usenetrc', False):
378             try:
379                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
380                 if info is not None:
381                     username = info[0]
382                     password = info[2]
383                 else:
384                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
385             except (IOError, netrc.NetrcParseError) as err:
386                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
387                 return
388
389         # Set language
390         request = compat_urllib_request.Request(self._LANG_URL)
391         try:
392             self.report_lang()
393             compat_urllib_request.urlopen(request).read()
394         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
395             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
396             return
397
398         # No authentication to be performed
399         if username is None:
400             return
401
402         request = compat_urllib_request.Request(self._LOGIN_URL)
403         try:
404             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
405         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
406             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
407             return
408
409         galx = None
410         dsh = None
411         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
412         if match:
413           galx = match.group(1)
414
415         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
416         if match:
417           dsh = match.group(1)
418
419         # Log in
420         login_form_strs = {
421                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
422                 u'Email': username,
423                 u'GALX': galx,
424                 u'Passwd': password,
425                 u'PersistentCookie': u'yes',
426                 u'_utf8': u'霱',
427                 u'bgresponse': u'js_disabled',
428                 u'checkConnection': u'',
429                 u'checkedDomains': u'youtube',
430                 u'dnConn': u'',
431                 u'dsh': dsh,
432                 u'pstMsg': u'0',
433                 u'rmShown': u'1',
434                 u'secTok': u'',
435                 u'signIn': u'Sign in',
436                 u'timeStmp': u'',
437                 u'service': u'youtube',
438                 u'uilel': u'3',
439                 u'hl': u'en_US',
440         }
441         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
442         # chokes on unicode
443         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
444         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
445         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
446         try:
447             self.report_login()
448             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
449             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
450                 self._downloader.report_warning(u'unable to log in: bad username or password')
451                 return
452         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
453             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
454             return
455
456         # Confirm age
457         age_form = {
458                 'next_url':     '/',
459                 'action_confirm':   'Confirm',
460                 }
461         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
462         try:
463             self.report_age_confirmation()
464             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
465         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
466             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
467             return
468
469     def _extract_id(self, url):
470         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
471         if mobj is None:
472             self._downloader.report_error(u'invalid URL: %s' % url)
473             return
474         video_id = mobj.group(2)
475         return video_id
476
477     def _real_extract(self, url):
478         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
479         mobj = re.search(self._NEXT_URL_RE, url)
480         if mobj:
481             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
482         video_id = self._extract_id(url)
483
484         # Get video webpage
485         self.report_video_webpage_download(video_id)
486         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
487         request = compat_urllib_request.Request(url)
488         try:
489             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
490         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
491             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
492             return
493
494         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
495
496         # Attempt to extract SWF player URL
497         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
498         if mobj is not None:
499             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
500         else:
501             player_url = None
502
503         # Get video info
504         self.report_video_info_webpage_download(video_id)
505         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
506             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
507                     % (video_id, el_type))
508             video_info_webpage = self._download_webpage(video_info_url, video_id,
509                                     note=False,
510                                     errnote='unable to download video info webpage')
511             video_info = compat_parse_qs(video_info_webpage)
512             if 'token' in video_info:
513                 break
514         if 'token' not in video_info:
515             if 'reason' in video_info:
516                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
517             else:
518                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
519             return
520
521         # Check for "rental" videos
522         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
523             self._downloader.report_error(u'"rental" videos not supported')
524             return
525
526         # Start extracting information
527         self.report_information_extraction(video_id)
528
529         # uploader
530         if 'author' not in video_info:
531             self._downloader.report_error(u'unable to extract uploader name')
532             return
533         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
534
535         # uploader_id
536         video_uploader_id = None
537         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
538         if mobj is not None:
539             video_uploader_id = mobj.group(1)
540         else:
541             self._downloader.report_warning(u'unable to extract uploader nickname')
542
543         # title
544         if 'title' not in video_info:
545             self._downloader.report_error(u'unable to extract video title')
546             return
547         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
548
549         # thumbnail image
550         if 'thumbnail_url' not in video_info:
551             self._downloader.report_warning(u'unable to extract video thumbnail')
552             video_thumbnail = ''
553         else:   # don't panic if we can't find it
554             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
555
556         # upload date
557         upload_date = None
558         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
559         if mobj is not None:
560             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
561             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
562             for expression in format_expressions:
563                 try:
564                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
565                 except:
566                     pass
567
568         # description
569         video_description = get_element_by_id("eow-description", video_webpage)
570         if video_description:
571             video_description = clean_html(video_description)
572         else:
573             video_description = ''
574
575         # subtitles
576         video_subtitles = None
577
578         if self._downloader.params.get('writesubtitles', False):
579             video_subtitles = self._extract_subtitle(video_id)
580             if video_subtitles:
581                 (sub_error, sub_lang, sub) = video_subtitles[0]
582                 if sub_error:
583                     self._downloader.report_error(sub_error)
584
585         if self._downloader.params.get('allsubtitles', False):
586             video_subtitles = self._extract_all_subtitles(video_id)
587             for video_subtitle in video_subtitles:
588                 (sub_error, sub_lang, sub) = video_subtitle
589                 if sub_error:
590                     self._downloader.report_error(sub_error)
591
592         if self._downloader.params.get('listsubtitles', False):
593             sub_lang_list = self._list_available_subtitles(video_id)
594             return
595
596         if 'length_seconds' not in video_info:
597             self._downloader.report_warning(u'unable to extract video duration')
598             video_duration = ''
599         else:
600             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
601
602         # token
603         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
604
605         # Decide which formats to download
606         req_format = self._downloader.params.get('format', None)
607
608         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
609             self.report_rtmp_download()
610             video_url_list = [(None, video_info['conn'][0])]
611         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
612             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
613             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
614             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
615             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
616
617             format_limit = self._downloader.params.get('format_limit', None)
618             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
619             if format_limit is not None and format_limit in available_formats:
620                 format_list = available_formats[available_formats.index(format_limit):]
621             else:
622                 format_list = available_formats
623             existing_formats = [x for x in format_list if x in url_map]
624             if len(existing_formats) == 0:
625                 self._downloader.report_error(u'no known formats available for video')
626                 return
627             if self._downloader.params.get('listformats', None):
628                 self._print_formats(existing_formats)
629                 return
630             if req_format is None or req_format == 'best':
631                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
632             elif req_format == 'worst':
633                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
634             elif req_format in ('-1', 'all'):
635                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
636             else:
637                 # Specific formats. We pick the first in a slash-delimeted sequence.
638                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
639                 req_formats = req_format.split('/')
640                 video_url_list = None
641                 for rf in req_formats:
642                     if rf in url_map:
643                         video_url_list = [(rf, url_map[rf])]
644                         break
645                 if video_url_list is None:
646                     self._downloader.report_error(u'requested format not available')
647                     return
648         else:
649             self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
650             return
651
652         results = []
653         for format_param, video_real_url in video_url_list:
654             # Extension
655             video_extension = self._video_extensions.get(format_param, 'flv')
656
657             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
658                                               self._video_dimensions.get(format_param, '???'))
659
660             results.append({
661                 'id':       video_id,
662                 'url':      video_real_url,
663                 'uploader': video_uploader,
664                 'uploader_id': video_uploader_id,
665                 'upload_date':  upload_date,
666                 'title':    video_title,
667                 'ext':      video_extension,
668                 'format':   video_format,
669                 'thumbnail':    video_thumbnail,
670                 'description':  video_description,
671                 'player_url':   player_url,
672                 'subtitles':    video_subtitles,
673                 'duration':     video_duration
674             })
675         return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_initialize(self):
        """Visit the disclaimer page and POST the age-confirmation form so
        that family-filtered videos become accessible in later requests."""
        # Retrieve disclaimer (the response body itself is not needed)
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age by submitting the filter form
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: ids of the form 'yt-XXXX' are
        # embedded YouTube videos, so delegate to the YouTube extractor
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # _download_webpage returns decoded text, so no .decode() calls are
        # needed below (str.decode() does not exist under Python 3).
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: direct mediaURL parameter
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended to the media URL) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media info lives in the flashvars query string
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON-escaped slashes need unescaping
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
792
793
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the video URL, title, uploader and upload date from a
        Dailymotion watch page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path component up to the first '_' or '?'
        video_id = url_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled via cookie
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Probe qualities from best to worst; keep the first one present
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Undo percent-encoding and JSON slash escaping
        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: try the regular owner markup first, then the markup
        # used for official users; missing uploader is only a warning
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; convert to YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
877
878
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Use the shared helper (like the other extractors in this file) so
        # the page arrives as decoded text; the previous raw urlopen().read()
        # plus str.decode('utf-8') calls broke under Python 3, where str has
        # no decode method.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
938
939
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; non-/watch/ URLs are rewritten to the
        canonical /watch/ form and the method recurses once on them."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage.
        # Note: the former str.decode('utf-8') calls have been dropped; they
        # crash under Python 3 where str has no decode method.
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Group 1 is the literal 'people'/'profile' path component; the
        # uploader name is captured by group 2 (group(1) was a bug).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1077
1078
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize protocol-less and player-redirect URLs to canonical form
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page.
        # IndexError: the ' = {config:' anchor is missing from the page;
        # ValueError: the sliced text is not valid JSON.
        # (The previous bare `except:` also swallowed KeyboardInterrupt.)
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (page gives ISO date; convert to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one codec
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1193
1194
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Scrapes videos.arte.tv by chasing a chain of intermediate pages with
    regexes (see extractPlus7Stream); live-stream pages matching _LIVE_URL
    are recognized but not actually extracted (see _real_extract).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Matches the trailing filename of live-stream index pages
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body.

        Returns None after reporting an error when the request fails.
        NOTE(review): callers (grep_webpage) do not check for None, so a
        failed fetch leads to re.search(..., None) — verify intended.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and map capture groups to a dict.

        matchTuples is a list of (group_index, key, error_message); each
        named group index is stored under *key* in the returned dict.
        Returns None (after reporting) when the regex or a group fails.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the legacy trouble() API while the rest
                # of this class uses report_error() — confirm intended
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player scripts for *url*.

        NOTE(review): builds video_url but neither returns nor stores it,
        so live streams are effectively unsupported (see _real_extract).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page down to its HD stream URL.

        Follows three hops: player flashvars -> per-language <video> ref ->
        final XML with id/name/date/url, and returns an info dict.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode here assumes a byte string; this would
            # fail on Python 3 where the title is already text — verify
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams: extractLiveStream discards its result, so this
        # branch yields no videos (returns None)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1325
1326
1327 class GenericIE(InfoExtractor):
1328     """Generic last-resort information extractor."""
1329
1330     _VALID_URL = r'.*'
1331     IE_NAME = u'generic'
1332
1333     def __init__(self, downloader=None):
1334         InfoExtractor.__init__(self, downloader)
1335
1336     def report_download_webpage(self, video_id):
1337         """Report webpage download."""
1338         if not self._downloader.params.get('test', False):
1339             self._downloader.report_warning(u'Falling back on generic information extractor.')
1340         self.to_screen(u'%s: Downloading webpage' % video_id)
1341
1342     def report_following_redirect(self, new_url):
1343         """Report information extraction."""
1344         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1345
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request for `url` and follows the redirect chain.
        Returns the final URL if it differs from the original, or False
        when no redirect happened.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass forcing the HEAD method, so the response
            # body is never downloaded while probing for redirects.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other status is treated as an error, not a redirect
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the same URL as a plain (GET) Request
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): a bare OpenerDirector is used (not build_opener) so
        # only these explicitly registered handlers participate.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # geturl() returns the original URL when no redirect occurred
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1399
1400     def _real_extract(self, url):
1401         new_url = self._test_redirect(url)
1402         if new_url: return [self.url_result(new_url)]
1403
1404         video_id = url.split('/')[-1]
1405         try:
1406             webpage = self._download_webpage(url, video_id)
1407         except ValueError as err:
1408             # since this is the last-resort InfoExtractor, if
1409             # this error is thrown, it'll be thrown here
1410             self._downloader.report_error(u'Invalid URL: %s' % url)
1411             return
1412
1413         self.report_extraction(video_id)
1414         # Start with something easy: JW Player in SWFObject
1415         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1416         if mobj is None:
1417             # Broaden the search a little bit
1418             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1419         if mobj is None:
1420             # Broaden the search a little bit: JWPlayer JS loader
1421             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1422         if mobj is None:
1423             self._downloader.report_error(u'Invalid URL: %s' % url)
1424             return
1425
1426         # It's possible that one of the regexes
1427         # matched, but returned an empty group:
1428         if mobj.group(1) is None:
1429             self._downloader.report_error(u'Invalid URL: %s' % url)
1430             return
1431
1432         video_url = compat_urllib_parse.unquote(mobj.group(1))
1433         video_id = os.path.basename(video_url)
1434
1435         # here's a fun little line of code for you:
1436         video_extension = os.path.splitext(video_id)[1][1:]
1437         video_id = os.path.splitext(video_id)[0]
1438
1439         # it's tempting to parse this further, but you would
1440         # have to take into account all the variations like
1441         #   Video Title - Site Name
1442         #   Site Name | Video Title
1443         #   Video Title - Tagline | Site Name
1444         # and so on and so forth; it's just not practical
1445         mobj = re.search(r'<title>(.*)</title>', webpage)
1446         if mobj is None:
1447             self._downloader.report_error(u'unable to extract title')
1448             return
1449         video_title = mobj.group(1)
1450
1451         # video uploader is domain name
1452         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1453         if mobj is None:
1454             self._downloader.report_error(u'unable to extract title')
1455             return
1456         video_uploader = mobj.group(1)
1457
1458         return [{
1459             'id':       video_id,
1460             'url':      video_url,
1461             'uploader': video_uploader,
1462             'upload_date':  None,
1463             'title':    video_title,
1464             'ext':      video_extension,
1465         }]
1466
1467
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearch[N|all]:terms)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so search terms containing a colon
        # (e.g. "ytsearch5:re: zero") are not truncated or crash.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Fix: the result list was previously computed but never returned
            return self._get_n_results(query, self._max_youtube_results)
        else:
            # Keep the try minimal: only int() can raise ValueError here
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query."""

        video_ids = []
        pagenum = 0
        limit = n

        # GData pages hold at most 50 results each
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # report_error for consistency with the rest of this IE
                # (trouble() is the legacy reporting entry point)
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps the effective limit when the query has fewer hits
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1541
1542
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Decode the gvsearch prefix and kick off the downloads."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix means "first result only"
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
            count = self._max_google_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""

        video_ids = []
        pagenum = 0

        def flush():
            # Hand every collected docid to the downloader as a videoplay URL
            for vid in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect unique video identifiers in order of appearance
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in video_ids:
                    continue
                video_ids.append(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    flush()
                    return

            # No "next page" link: we have everything there is
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                flush()
                return

            pagenum = pagenum + 1
1623
1624
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Decode the yvsearch prefix and kick off the downloads."""
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix means "first result only"
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if count <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""

        video_ids = []
        seen = set()
        pagenum = 1

        def flush():
            # Hand every collected id to the downloader as a watch URL
            for vid in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect unique video identifiers in first-seen order
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in seen:
                    continue
                video_ids.append(vid)
                seen.add(vid)
                if len(video_ids) == n:
                    # Specified n videos reached
                    flush()
                    return

            # No "Next" link: we have everything there is
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                flush()
                return

            pagenum = pagenum + 1
1709
1710
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so the match must use
        # re.VERBOSE; the default suitable() would not set the flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch every page of the playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # Fix: initialize the title so an empty playlist (no 'entry' key on
        # the first page) does not raise NameError when building the result.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so the final list can be sorted
            # into playlist order regardless of feed ordering.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1793
1794
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from *page*, in order of appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Collect every video of the channel and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the json-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    raw = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(raw)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                # The widget html advertises whether yet another page exists
                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(entries, channel_id)]
1867
1868
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through the user's GData uploads feed and return a playlist."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = match.group(1)

        # The Data API serves at most _GDATA_PAGE_SIZE ids per request,
        # so keep fetching the next slice until a short page signals the end.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the distinct ids appearing on this page
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            # A page shorter than the page size must be the last one;
            # no need to issue another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1939
1940
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Page through the user's episode list and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Fix: a page without a data-users-id attribute used to crash with
        # AttributeError (None.group); report a proper extraction error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str (not str) for consistency with the rest
                # of the file and to avoid py2 unicode conversion issues.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Fix: unescape before the membership test so deduplication
                # compares like with like (previously raw hrefs were checked
                # against already-unescaped stored ids).
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2019
2020
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _real_extract(self, url):
        """Extract the direct fileshare URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # POST the 'Free download' form field to reach the real file page
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # Try to figure out reason of the error.
            restriction = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if restriction is not None and restriction.group(1) is not None:
                message = re.sub('\s+', ' ', restriction.group(1)).strip()
                self._downloader.report_error(u'%s' % message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2075
2076
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook before extraction.

        Credentials come from the downloader params ('username'/'password')
        or, if 'usenetrc' is set, from the .netrc machine 'facebook'.
        Logging in is best-effort: all failures only emit a warning and
        extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUG FIX: corrected misspelled "exceded" in the warning text.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL (HD preferred), title, duration and thumbnail
        from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The SWF player parameters are embedded as JSON between these two
        # literal script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2174
2175
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Handles three cases: /play/ embed URLs (resolved via their redirect
        and re-dispatched through this method), responses that are already
        the media file (direct download), and the regular JSON API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment contains a 'file'
        # query parameter; rewrite to the canonical /a/a-<id> form and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask for the JSON description of the video, preserving any existing
        # query string.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # NOTE(review): the iTunes User-Agent appears to be required by this
        # endpoint (it is also advertised in the returned info dict) — confirm.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2 byte
                # strings; this line would raise under Python 3 — confirm
                # the supported interpreter versions.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Hand the open response to the downloader via 'urlhandle'
                # so the already-received body is not fetched twice.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # 'urlh' is the response opened in the try block above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # The API datestamp is of the form MM-DD-YY HH:MM(AM|PM);
                # normalize it to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the file extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2272
2273
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # NOTE: the redundant pass-through __init__ was removed; the inherited
    # InfoExtractor constructor is identical.

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was self._download.report_error — '_download' does not
            # exist and raised AttributeError instead of reporting the error.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media-server base path; the flv for
        # this video lives under the same directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2318
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Feed bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Feed bitrate -> display resolution (shown by --list-formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden so matching is done with re.VERBOSE — _VALID_URL above
        # is written in verbose mode with insignificant whitespace.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report that the mediaGen configuration for media_id is being fetched."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the show's MRSS index feed is being fetched."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print a bitrate / extension / dimensions table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per part of the requested episode or clip.

        Resolves shortname forms (":tds", ":colbert") to the show's
        full-episodes page, follows its redirect to the newest episode,
        then walks the MRSS index and each item's mediaGen config to
        build downloadable URLs.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortname forms are rewritten to the show's full-episodes listing
        # and re-matched against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The listing page redirects to the newest episode; take the
            # episode title from the redirected URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp URL) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a plain HTTP URL on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2510
2511
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Announce that the player configuration is being fetched."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the MP4 URL and metadata for an Escapist episode page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        episode_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            page_bytes = page_handle.read()
            # Decode with the charset advertised in the Content-Type header,
            # falling back to UTF-8.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            page = page_bytes.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in the page's description / OpenGraph meta tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the config URL as an escaped query argument.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # normalized before json.loads will accept it.
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': episode_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2582
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): _WORKING = False appears to mark this extractor as
    # broken/disabled — confirm against the base class's handling.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML, then derive the media URL
        from the Adobe HDS (f4m) manifest it references."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partial info dict; the remaining keys are filled in below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Manifest elements live in the Adobe f4m XML namespace.
            # Note: video_id is deliberately rebound to the manifest's <id>.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Compose the first-segment/first-fragment URL of the HDS stream.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2649
2650
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded inside the flashvars.
        m = re.search(r'flv_url=(.+?)&', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # The title comes from the <title> tag, minus the site suffix.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # The thumbnail is the whole matched image URL (group 0).
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2704
2705
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track id is obtained through the public resolve API endpoint;
       the MP3 URL is then taken from the stream definitions returned by
       the streams endpoint for that track.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return the MP3 stream info for a soundcloud track URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader name and the track slug are part of the permalink.
        uploader, slug_title = match.group(1), match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the track's JSON description.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for the resolved track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2774
2775 class SoundcloudSetIE(InfoExtractor):
2776     """Information extractor for soundcloud.com sets
2777        To access the media, the uid of the song and a stream token
2778        must be extracted from the page source and the script must make
2779        a request to media.soundcloud.com/crossdomain.xml. Then
2780        the media can be grabbed by requesting from an url composed
2781        of the stream token and uid
2782      """
2783
2784     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2785     IE_NAME = u'soundcloud'
2786
    def __init__(self, downloader=None):
        # Pure pass-through to the base constructor; kept for explicitness.
        InfoExtractor.__init__(self, downloader)
2789
2790     def report_resolve(self, video_id):
2791         """Report information extraction."""
2792         self.to_screen(u'%s: Resolving id' % video_id)
2793
2794     def _real_extract(self, url):
2795         mobj = re.match(self._VALID_URL, url)
2796         if mobj is None:
2797             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798             return
2799
2800         # extract uploader (which is in the url)
2801         uploader = mobj.group(1)
2802         # extract simple title (uploader + slug of song title)
2803         slug_title =  mobj.group(2)
2804         simple_title = uploader + u'-' + slug_title
2805
2806         self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2807
2808         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2809         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2810         request = compat_urllib_request.Request(resolv_url)
2811         try:
2812             info_json_bytes = compat_urllib_request.urlopen(request).read()
2813             info_json = info_json_bytes.decode('utf-8')
2814         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2815             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2816             return
2817
2818         videos = []
2819         info = json.loads(info_json)
2820         if 'errors' in info:
2821             for err in info['errors']:
2822                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2823             return
2824
2825         for track in info['tracks']:
2826             video_id = track['id']
2827             self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2828
2829             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2830             request = compat_urllib_request.Request(streams_url)
2831             try:
2832                 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2833                 stream_json = stream_json_bytes.decode('utf-8')
2834             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2835                 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2836                 return
2837
2838             streams = json.loads(stream_json)
2839             mediaURL = streams['http_mp3_128_url']
2840
2841             videos.append({
2842                 'id':       video_id,
2843                 'url':      mediaURL,
2844                 'uploader': track['user']['username'],
2845                 'upload_date':  track['created_at'],
2846                 'title':    track['title'],
2847                 'ext':      u'mp3',
2848                 'description': track['description'],
2849             })
2850         return videos
2851
2852
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com presentation pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL, title and description from an InfoQ page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream path is base64-encoded (and URL-quoted) in the
        # page's jsclassref JavaScript variable.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder).
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split only on the LAST dot: a plain split('.') would raise
        # ValueError for filenames that contain extra dots.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2902
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry is a mapping of bitrate -> url list, pick the
        requested (or highest) bitrate; when it is a plain list there is no
        bitrate info and the list is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in url_list that answers, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the cloudcast JSON API and return a single-item info list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Uploader & cloudcast slug come straight from the URL; re.match
        # groups are already str, so no .decode() (that broke on Python 3).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode explicitly: read() returns bytes on Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Pre-seed so both names are bound even when formats is empty.
        format_param = None
        file_url = None

        if req_format is None or req_format == 'best':
            # Probe each advertised format until one URL responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # Every candidate URL failed; bail out instead of crashing below.
            self._downloader.report_error(u'unable to find a working stream URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3013
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded into references to its videos), and the root page
    (expanded into references to all courses).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading webpage' % objid)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # fromstring() accepts bytes, so no decode needed here.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each video page link once, preserving page order.
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode explicitly: read() returns bytes on Python 3 and the
                # str regexes below would raise TypeError on a bytes page.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'ignore')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect each course page link once, preserving page order.
            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3121
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the mediaGen metadata and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # webpage is already a decoded str; the old .decode('iso-8859-1')
        # calls crashed on Python 3 (str has no decode) and were wrong on
        # Python 2 unicode as well.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated trouble() call.
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3197
3198
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random numbers."""
        millis = int(time.time() * 1000)
        return "%d%d%d" % (millis, random.randint(1000, 1998), random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-shuffled character table used to decode file ids."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        while pool:
            # Linear-congruential step, then draw one remaining character.
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index list into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(idx)] for idx in fileId.split('*') if idx)

    def _real_extract(self, url):
        """Fetch the playlist JSON and return one info dict per flv segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            entry = config['data'][0]
            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested quality onto Youku's format names; everything
            # except 'worst' ends up as an flv download.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters at positions 8-9 of the decoded file id encode the
        # segment number; they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3304
3305
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the video page is being downloaded."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page HTML.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3364
3365
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video behind a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: use the first line of the description meta tag
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Without the return the next line crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort by numeric resolution: a plain string sort would rank
        # '720' above '1080'.
        links = sorted(mobj, key=lambda res_url: int(res_url[0]))

        # Highest resolution is last after the ascending sort.
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3489
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape the metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The video file lives at a predictable CDN path derived from the id.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First regex group from the page, unescaped, or the default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key was misspelled 'uploader_date'; the documented field name
            # is 'upload_date', so the date was silently ignored downstream.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3525
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page and return (item_count, list of *valid* items).

        On error, report it and return (0, []) so the caller can always
        unpack the result.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: returning None here made the caller's tuple unpack
            # raise a TypeError; return an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # On failure the API answers with an error object, not a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes to get
                # the YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # A channel URL: its archives must be fetched page by page.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means the end of the archives was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3608
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: execution used to fall through and crash on m.group()
            # below when no match was found; raise a proper error instead.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> if the player heading is missing.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUG FIX: replaced the deprecated trouble() call, which also
                # did not stop execution before the m.group() crash.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3647
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game videos."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # NOTE(review): the three finditer streams are assumed to yield the
        # url, title and thumbnail of the same video in lockstep — confirm.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # BUG FIX: the entry used to be appended anyway after the
                # error was reported; skip entries without a URL instead.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3688
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The recorded stream lives at a fixed CDN location per id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = title_match.group('title')
        uploader_match = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3710
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Pick the container from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUG FIX: use report_error like the other extractors instead of
            # the deprecated trouble() method.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # BUG FIX: corrected the 'World Start' typo in the fallback title.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3766
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded in the page as an inline JSON blob.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3801
3802
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age-verification cookie is required to see the video page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the stale 'result' variable from the
            # page parsing above, so an unavailable format was never reported
            # and [None] could be returned.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3919
3920
3921
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is taken directly from the URL path.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message wrongly said 'video title' on this path.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3963
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page; it links to the embedded player page.
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Get the embed page
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The player page passes the stream URL to the flash player.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4009
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id lets us walk the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        track_index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            track_index += 1
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
4053
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN pattern per video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4077
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s] matches only a literal dot or whitespace, not
        # "any character" -- this appears to rely on attributes being
        # separated purely by whitespace; confirm against live pages.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk found on the page becomes a deferred 'TED' url result,
        # extracted individually by _talk_info later.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The inline talkDetails script holds the numeric id and mediaSlug,
        # from which the download URL is built by _talk_video_link.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4155
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name 'ext',
            # raising a NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4211
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Each video has a flash-player configuration served as XML.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # NOTE(review): the last child element is assumed to describe the
        # preferred stream variant -- confirm against the XML schema.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
4244
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: use report_error like the other extractors instead of
            # the deprecated trouble() method.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the deprecated trouble() call did not stop execution,
            # so m.group() below raised an AttributeError; raise instead.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4293
class ARDIE(InfoExtractor):
    """Extractor for ARD Mediathek / mediathek.daserste.de pages."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer an explicit documentId query parameter; otherwise fall back
        # to the id captured from the URL path.
        id_match = re.search(r'documentId=([0-9]+)', url)
        if id_match is not None:
            video_id = id_match.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # Scrape title and the mediaCollection.addMediaStream(...) calls.
        page = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, page).group('title')
        streams = []
        for stream_match in re.finditer(self._MEDIA_STREAM, page):
            streams.append(stream_match.groupdict())
        if not streams:
            # No streams at all is expected only for age-restricted ("fsk")
            # content, which is unavailable before 8 pm.
            assert '"fsk"' in page
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # Restrict to the default media type (0) and take the top quality.
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # A stream is delivered either via RTMP or as a plain HTTP mp4.
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4333
4334
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Kept as an explicit ordered list of classes: matching is first-wins,
    # so more specific extractors precede the catch-all GenericIE.
    ordered_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    ]
    return [klass() for klass in ordered_classes]
4390
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention, so the
    # class can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]