Fix InfoQ in Python 3
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import netrc
9 import os
10 import re
11 import socket
12 import time
13 import email.utils
14 import xml.etree.ElementTree
15 import random
16 import math
17
18 from .utils import *
19
20
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor ("IE") receives a URL and produces a *list*
    of dictionaries describing the video(s) that URL refers to. Those
    dictionaries are handed to the FileDownloader, which may then download
    the video to the file system, among other outcomes.

    Every dictionary must carry these fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    These fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should override _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and usually be added to the list of
    extractors. _real_extract() must return a *list* of information
    dictionaries as described above.

    Set the _WORKING attribute to False for broken IEs so users are
    warned and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self.set_downloader(downloader)
        self._ready = False

    def suitable(self, url):
        """Return True when this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this IE is currently functional (_WORKING)."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name minus the "IE" suffix.
        return self.__class__.__name__[:-2]

    def _download_webpage(self, url, video_id, note=None, errnote=None):
        """Fetch *url* and return the body decoded as UTF-8 (errors replaced)."""
        note = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            handle = compat_urllib_request.urlopen(url)
            content = handle.read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
        return content
121
122
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unknown is assumed to be flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string for --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose syntax, so re.VERBOSE is mandatory here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration get a 4s default
            start = float(start)
            end = start + float(dur)
            # hh:mm:ss,mmm as required by the SRT format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for *video_id*.

        Returns a (warning, srt) pair: on success warning is None and srt
        holds the SubRip text; on failure srt is None and warning carries
        the message to show.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing page.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user choice > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials exist, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID embedded in *url*, or None (after trouble()) on mismatch."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the bare video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the information dictionaries for a YouTube URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the various &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Parsed successfully; stop so the normalized YYYYMMDD
                    # string is not fed back into strptime.
                    break
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Build itag -> URL map. Some stream entries carry no 'sig'
            # parameter; use their URL as-is instead of raising KeyError.
            url_map = {}
            for ud in url_data:
                fmt_url = ud['url'][0]
                if 'sig' in ud:
                    fmt_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = fmt_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
540
541
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age-confirmation form."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the information dictionary for a Metacafe URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 urlopen().read() returns bytes,
            # while the regexes below operate on str.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # The webpage is already a decoded str, so no further .decode() calls:
        # they would raise AttributeError on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
667
668
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the URL component before any '_<slug>' suffix or query string
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are still served
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            # Decode to str right away so the regexes below work on Python 3 too
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, from highest to lowest
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            # for/else: none of the known quality keys was present
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is JSON-escaped inside flashvars ('\/' -> '/')
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # Fall back to looking for an official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Page shows the date as DD-MM-YYYY; convert to the YYYYMMDD convention
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
765
766
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts the direct .flv URL, title and uploader from a Photobucket
    media page whose URL carries a ``current=<name>.flv`` parameter.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to str immediately: on Python 3 the str-pattern regexes
            # below would raise TypeError when applied to a bytes object
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already str; the old .decode('utf-8') calls would
        # raise AttributeError on Python 3
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
830
831
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Resolves a Yahoo! Video page to its playlist XML and extracts the
    final media URL plus title/uploader/thumbnail/description metadata.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode to str so the regexes below work on Python 3 as well
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL (single level of recursion)
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is only the '(people|profile)' path component; the actual
        # uploader name is captured by group(2)
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
973
974
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the embedded player config JSON out of the video page and
    builds a play_redirect URL for the best available codec/quality.
    """

    # _VALID_URL matches Vimeo URLs (dot after (www|player) now escaped so
    # it no longer matches an arbitrary character there)
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON; slicing between ' = {config:' and ',assets:'
        # is fragile, so any parse failure is reported as an extraction error.
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            # for/else: no known codec was offered for this video
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1089
1090
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Follows the chain page -> videoref XML -> video XML for the '+7'
    catch-up service; live-stream URLs are detected but not yet supported.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body as a str (None on error)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode to str so grep_webpage's str-pattern regexes work on
            # Python 3 (matching str patterns against bytes raises TypeError)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and map regex groups to keys per *matchTuples*.

        matchTuples is a list of (group_index, key, error_message); returns
        a dict of the captured values, or None after reporting an error.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # NOTE: live streams are not fully supported yet; this walks the
        # player JS to find the rtmp URL but deliberately returns nothing.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a '+7' catch-up page to its final info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds a videorefFileUrl pointing at a ref XML
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the ref XML lists one <video> element per language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language XML carries id/title/date and the HD URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # already a str; the old .decode('utf-8') broke on Python 3
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1225
1226
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened; carry on with normal extraction
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to str so the str-pattern regexes below don't raise
            # TypeError on Python 3 when matched against bytes
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1371
1372
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:query)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # query is already a unicode string here; the old
        # query.decode(preferredencoding()) call broke on Python 3, where
        # str has no decode method (and mismatched the utf-8 encoding anyway).
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch pseudo-URL and queue up to n result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        # Keep the query as text; encoding it to bytes here broke string
        # formatting and URL quoting on Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode explicitly so json.loads receives text on every
                # supported Python version (json.loads rejects bytes < 3.6).
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems may be smaller than the number of results requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1447
1448
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:query)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query is already a unicode string; the old
        # query.decode(preferredencoding()) call broke on Python 3.
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch pseudo-URL and queue up to n result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        # Keep the query as text; encoding it to bytes here broke string
        # formatting and URL quoting on Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str regexes below work on Python 3 as well
                # (matching a str pattern against bytes raises TypeError).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1529
1530
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:query)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query is already a unicode string; the old
        # query.decode(preferredencoding()) call broke on Python 3.
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch pseudo-URL and queue up to n result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        # Keep the query as text; encoding it to bytes here broke string
        # formatting and URL quoting on Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str regexes below work on Python 3 as well.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1615
1616
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # URL points at a single video inside a playlist: delegate directly.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # 'p' is the default prefix for playlists; artist pages need
        # a different access point.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids in order, skipping duplicates.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        total = len(video_ids)

        # Honor --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1695
1696
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Keep the first occurrence of each id on this page.
            page_ids = []
            for candidate in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1747
1748
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through a user's GData upload feed and queue every video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps every response (currently at 50 entries), so
        # consecutive pages are requested until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            feed_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            request = compat_urllib_request.Request(feed_url)

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # First occurrence wins; feed order is preserved.
            page_ids = []
            for candidate in re.findall(self._VIDEO_INDICATOR, page):
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one, so there is
            # no need for a further query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1830
1831
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is embedded in the profile page. Guard the
        # lookup: previously a missing attribute surfaced as an unhandled
        # AttributeError because the except clause above only covers
        # network errors.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Deduplicate on the unescaped id so
            # the membership test compares against what is actually stored
            # (the old code checked the raw id but stored the unescaped one).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1922
1923
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # POST data must be bytes on Python 3.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('utf-8'))
        try:
            self.report_download_webpage(file_id)
            # Decode to text so the str regexes below work on Python 3 as
            # well (assumes the page is served as UTF-8 -- TODO confirm).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # All of these values are already text; the old .decode('utf-8')
        # calls crashed on Python 3, where str has no decode method.
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1982
1983
1984 class FacebookIE(InfoExtractor):
1985     """Information Extractor for Facebook"""
1986
1987     _WORKING = False
1988     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1989     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1990     _NETRC_MACHINE = 'facebook'
1991     _available_formats = ['video', 'highqual', 'lowqual']
1992     _video_extensions = {
1993         'video': 'mp4',
1994         'highqual': 'mp4',
1995         'lowqual': 'mp4',
1996     }
1997     IE_NAME = u'facebook'
1998
1999     def __init__(self, downloader=None):
2000         InfoExtractor.__init__(self, downloader)
2001
2002     def _reporter(self, message):
2003         """Add header and report message."""
2004         self._downloader.to_screen(u'[facebook] %s' % message)
2005
2006     def report_login(self):
2007         """Report attempt to log in."""
2008         self._reporter(u'Logging in')
2009
2010     def report_video_webpage_download(self, video_id):
2011         """Report attempt to download video webpage."""
2012         self._reporter(u'%s: Downloading video webpage' % video_id)
2013
2014     def report_information_extraction(self, video_id):
2015         """Report attempt to extract video information."""
2016         self._reporter(u'%s: Extracting video information' % video_id)
2017
    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Scrapes title/description/owner/thumbnail plus one URL per known
        format out of the inline JavaScript of a Facebook video page, and
        returns them as a dict (format URLs nested under 'video_urls').

        NOTE(review): the .decode("unicode_escape") calls below assume
        the regex groups are byte strings, i.e. Python 2 semantics; on
        Python 3 str has no decode method, so this path needs porting
        (the class is currently marked _WORKING = False).
        """
        # General data: one regex per metadata field, keyed by output name.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            # Missing fields are simply omitted from the result dict.
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: try each known quality level in turn.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info
2043
2044     def _real_initialize(self):
2045         if self._downloader is None:
2046             return
2047
2048         useremail = None
2049         password = None
2050         downloader_params = self._downloader.params
2051
2052         # Attempt to use provided username and password or .netrc data
2053         if downloader_params.get('username', None) is not None:
2054             useremail = downloader_params['username']
2055             password = downloader_params['password']
2056         elif downloader_params.get('usenetrc', False):
2057             try:
2058                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2059                 if info is not None:
2060                     useremail = info[0]
2061                     password = info[2]
2062                 else:
2063                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2064             except (IOError, netrc.NetrcParseError) as err:
2065                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2066                 return
2067
2068         if useremail is None:
2069             return
2070
2071         # Log in
2072         login_form = {
2073             'email': useremail,
2074             'pass': password,
2075             'login': 'Log+In'
2076             }
2077         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2078         try:
2079             self.report_login()
2080             login_results = compat_urllib_request.urlopen(request).read()
2081             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2082                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2083                 return
2084         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2085             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2086             return
2087
2088     def _real_extract(self, url):
2089         mobj = re.match(self._VALID_URL, url)
2090         if mobj is None:
2091             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2092             return
2093         video_id = mobj.group('ID')
2094
2095         # Get video webpage
2096         self.report_video_webpage_download(video_id)
2097         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2098         try:
2099             page = compat_urllib_request.urlopen(request)
2100             video_webpage = page.read()
2101         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2102             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2103             return
2104
2105         # Start extracting information
2106         self.report_information_extraction(video_id)
2107
2108         # Extract information
2109         video_info = self._parse_page(video_webpage)
2110
2111         # uploader
2112         if 'owner' not in video_info:
2113             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2114             return
2115         video_uploader = video_info['owner']
2116
2117         # title
2118         if 'title' not in video_info:
2119             self._downloader.trouble(u'ERROR: unable to extract video title')
2120             return
2121         video_title = video_info['title']
2122         video_title = video_title.decode('utf-8')
2123
2124         # thumbnail image
2125         if 'thumbnail' not in video_info:
2126             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2127             video_thumbnail = ''
2128         else:
2129             video_thumbnail = video_info['thumbnail']
2130
2131         # upload date
2132         upload_date = None
2133         if 'upload_date' in video_info:
2134             upload_time = video_info['upload_date']
2135             timetuple = email.utils.parsedate_tz(upload_time)
2136             if timetuple is not None:
2137                 try:
2138                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2139                 except:
2140                     pass
2141
2142         # description
2143         video_description = video_info.get('description', 'No description available.')
2144
2145         url_map = video_info['video_urls']
2146         if url_map:
2147             # Decide which formats to download
2148             req_format = self._downloader.params.get('format', None)
2149             format_limit = self._downloader.params.get('format_limit', None)
2150
2151             if format_limit is not None and format_limit in self._available_formats:
2152                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2153             else:
2154                 format_list = self._available_formats
2155             existing_formats = [x for x in format_list if x in url_map]
2156             if len(existing_formats) == 0:
2157                 self._downloader.trouble(u'ERROR: no known formats available for video')
2158                 return
2159             if req_format is None:
2160                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2161             elif req_format == 'worst':
2162                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2163             elif req_format == '-1':
2164                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2165             else:
2166                 # Specific format
2167                 if req_format not in url_map:
2168                     self._downloader.trouble(u'ERROR: requested format not available')
2169                     return
2170                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2171
2172         results = []
2173         for format_param, video_real_url in video_url_list:
2174             # Extension
2175             video_extension = self._video_extensions.get(format_param, 'mp4')
2176
2177             results.append({
2178                 'id':       video_id.decode('utf-8'),
2179                 'url':      video_real_url.decode('utf-8'),
2180                 'uploader': video_uploader.decode('utf-8'),
2181                 'upload_date':  upload_date,
2182                 'title':    video_title,
2183                 'ext':      video_extension.decode('utf-8'),
2184                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2185                 'thumbnail':    video_thumbnail.decode('utf-8'),
2186                 'description':  video_description.decode('utf-8'),
2187             })
2188         return results
2189
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension out of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info via the JSON API, or directly when the URL
        already points at a media file (Content-Type: video/*)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for a JSON description of the page instead of HTML
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # On Python 3 the URL is already a str and str.decode does
                # not exist; only decode Python 2 byte strings.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): presumably blip.tv serves some media only to this
        # user agent — confirm before removing
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2279
2280
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed typo: was self._download.trouble, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the flv lives
        # next to the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2329
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Bitrates, best first
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default matcher
        # (which compiles without flags) cannot be used here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/clip URL to its mediagen items and return one
        info dict per item found in the show index."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest full episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            # Decode to str: on Python 3, re.findall with a str pattern
            # raises TypeError when handed the raw bytes response.
            html = htmlHandle.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The show page redirects to the newest episode; re-parse the
            # final URL to learn which episode that is.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per media segment in the MRSS index
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the mediagen config
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed from `format`, which shadowed the builtin)
            fmt, video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    fmt, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': fmt,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2540
2541
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the player-configuration download step."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the flv URL and metadata for an Escapist video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            response = compat_urllib_request.urlopen(url)
            raw_bytes = response.read()
            charset = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            page = raw_bytes.decode(charset.group(1) if charset else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Metadata sits in <meta> tags; the og:video player URL carries the
        # (percent-encoded) configuration URL in its query string.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_response = compat_urllib_request.urlopen(config_url)
            charset = re.match(r'text/html; charset="?([^"]+)"?', config_response.headers['Content-Type'])
            config_text = config_response.read().decode(charset.group(1) if charset else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must become
        # double quotes before json can parse it.
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'flv',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2615
2616
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Downloads the moogaloop metadata XML for a video id, then the Adobe
    HDS (f4m) manifest it points at, and rebuilds a fragment URL from the
    two documents.
    """

    # NOTE(review): presumably marks this extractor as broken/disabled for
    # the framework — confirm against the base class before relying on it.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally: title/description/thumbnail from the
        # metadata XML, url/ext from the manifest.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element makes the document unusable as a whole
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Pull the first <media> node's url attribute and the document-level
        # <id> from the f4m manifest (note the Adobe namespace).
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Rebuild the fragment URL on the manifest's host; video_id[:-2]
        # drops the last two characters of the manifest id — presumably a
        # quality suffix, TODO confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2687
2688
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded in a flashvars-style parameter
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched URL (group 0) is the thumbnail, not a sub-group
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2746
2747
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track page URL is resolved to the track's numeric API id via
       resolve.json; the streams endpoint is then queried for the direct
       mp3 URL (http_mp3_128_url).
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader name and the slug of the song title are part
        # of the URL itself.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            # urlopen returns bytes; decode explicitly for Python 3
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2820
2821
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream id is base64-encoded in the page. b64decode needs
        # bytes on Python 3, hence the encode/decode round-trip.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Description is optional; fall back to a placeholder
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = mobj.group(1) if mobj is not None else u'No description available.'

        # Derive id and extension from the last path component of the URL
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2877
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format/bitrate; if the
        format carries no bitrate sub-dict, the format entry itself is
        the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none responds."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUGFIX (Python 3): match groups are already str; the old
        # .decode('utf-8') calls raised AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen gives bytes; decode explicitly for Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2992
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page (course+video), a
    course page (list of videos), and the root page (list of courses).
    The two list cases return the recursively-extracted results of every
    referenced page.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            # ElementTree accepts bytes input directly
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # BUGFIX (Python 3): decode to str — str regex patterns
                # below cannot be applied to bytes
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # BUGFIX (Python 3): decode to str before regex matching
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3109
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUGFIX (Python 3): webpage is already str (it is matched with str
        # patterns below), so calling .decode('iso-8859-1') on a match group
        # raised AttributeError.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3189
3190
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    The player JSON carries per-format segment lists; each segment URL
    is derived from a seeded shuffle of a fixed alphabet.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random blocks
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the fixed alphabet with Youku's seeded PRNG; return a list."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        for _ in range(len(alphabet)):
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated index list onto the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(idx)] for idx in fileId.split('*') if idx)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto Youku's format keys
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [seg['k'] for seg in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id carry the segment number, so each
        # segment patches its own index into that slot.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3300
3301
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it (urlopen returns bytes)
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3364
3365
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: the first line of the post description
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: previously fell through and indexed an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3489
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First unescaped group of rexp in the page, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: was 'uploader_date', a typo — the documented field
            # name consumed downstream is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3525
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the JSON API at `url`.

        Returns (raw_item_count, infos) where `infos` holds only the
        clips that actually have a video_file_url. On a download error
        the problem is reported and (0, []) is returned.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: this used to return None, which crashed the caller's
            # tuple unpacking ("page_count, page_info = ...") with a
            # TypeError whenever a page download failed.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # created_on starts with an ISO date; strip the dashes to
                # get the YYYYMMDD form the downloader expects.
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3604
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The direct file URL sits in the second <source> tag of the player.
        url_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not url_match:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(url_match.group('url'))

        title_match = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not title_match:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(title_match.group('title'))

        # The description is optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3641
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The numeric status id doubles as the file name on the CDN.
        status_match = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not status_match:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = status_match.group(1)

        text_match = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not text_match:
            self._downloader.trouble(u'WARNING: Cannot find description')
        # Drop embedded links before unescaping; the tweet text serves as
        # both title and description.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', text_match.group(1))).strip()

        uploader_match = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not uploader_match:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(uploader_match.group('uploader'))
        uploader_id = unescapeHTML(uploader_match.group('uploader_id'))

        time_match = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not time_match:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(time_match.group(1))).strftime('%Y%m%d')

        return [{
            'id': video_id,
            'url': 'http://files.tweetreel.com/video/' + status_id + '.mov',
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }]
3690         
class SteamIE(InfoExtractor):
    """Information extractor for trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern and must
        # be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always fetch the game's video overview page, which lists every
        # trailer for the title.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        movie_matches = re.finditer(urlRE, webpage)
        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        title_matches = list(re.finditer(namesRE, webpage))
        videos = []
        # Movie entries and title spans appear in the same order; pair them.
        for movie_m, title_m in zip(movie_matches, title_matches):
            video_id = movie_m.group('videoID')
            video_url = movie_m.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': title_m.group('videoName')
            })
        return videos
3726         
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The CDN location is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3748
3749
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # The classes are listed from most to least specific so that e.g. a
    # YouTube playlist URL is claimed before the plain YoutubeIE, and
    # GenericIE only acts as the final fallback.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ie_classes]
3793
3794