TweetReel IE
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    Given a URL, an information extractor produces a *list* of dictionaries
    describing the video(s) behind it; the FileDownloader consumes those
    dictionaries and may download the media to disk.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); they should usually also be
    registered in the list of extractors.  Broken extractors should set
    _WORKING to False so users are warned and their tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Perform one-time initialization (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the default name from the class name, dropping the
        # trailing "IE" suffix (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]
107
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: accepts full watch URLs, short youtu.be links, embeds,
    # anchor (#/) redirect URLs, and even a naked video ID on its own.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension; anything absent falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string; values appear to be height x width
    # (e.g. '22' -> 720x1280) — only used for display.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Class attribute intentionally shadows the base-class IE_NAME property.
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden to compile _VALID_URL with re.VERBOSE, which the base
        # class does not do.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Missing dur attribute: show the caption for 4 seconds.
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format seconds as the SRT timestamp HH:MM:SS,mmm.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning, srt) pair: on failure warning is a message and
        srt is None; on success warning is None and srt is the .srt text.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the track listing.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: --subtitleslang option, then English, then
        # whichever track comes first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one "itag : extension [dimensions]" line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract one info dict per selected format for the given URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values and stop at the first
        # response that carries a 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (non-fatal: only a warning if missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators to spaces, then try several
        # human-readable date layouts and re-emit as YYYYMMDD.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # Parse failure for this layout; try the next one.
                    # NOTE(review): bare except also swallows non-parse errors.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is assumed present here but is not part of
            # the filter above — an entry without it would raise KeyError.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            # Restrict to formats at or below --format-limit, if given.
            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per (format, URL) pair selected above.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
522
523
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # Class attribute intentionally shadows the base-class IE_NAME property.
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST past the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: ids of the form "yt-XXXX" are
        # delegated to the YouTube extractor via the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        # NOTE(review): webpage is left as bytes here; the str regex patterns
        # and .decode() calls below look Python 2 only — verify under py3.
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: pull the media URL and key out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
649
650
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The numeric video id is the first path component, before any
        # "_title" suffix or query string.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page; the cookie disables the family filter so that
        # age-restricted videos are also served.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URLs live in a urlencoded "flashvars" blob.
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best quality present, highest first.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((k for k in quality_keys if k in flashvars), None)
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Try the regular uploader span first, then fall back to the
        # "official" account markup; missing uploader is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            # lookin for official user
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; store it as YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
747
748
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a photobucket page."""
        # Extract id from URL; group(1) of _VALID_URL is the .flv name.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Decode to text right away so the str regexes below work on
        # Python 3 as well (the original matched str patterns on bytes).
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        # All values are already text; the previous .decode('utf-8') calls
        # on str objects would raise AttributeError on Python 3.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
812
813
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info.

        Non-/watch/ URLs are first rewritten to the canonical English
        /watch/ form and re-dispatched once (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode to text so the str regexes work on Python 3 too.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the 'people'/'profile' path fragment; the uploader
        # name itself is in group(2) (the original code took group(1)).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        # All values are already text; the previous .decode('utf-8') calls
        # on str objects would raise AttributeError on Python 3.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
955
956
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.  The dot after (www|player) is
    # escaped so it only matches a literal '.', not any character.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.  Catch Exception
        # rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (page shows ISO date; stored as YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1071
1072
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two page types: "Plus 7" catch-up pages (extracted) and live
    stream index pages (recognized by _LIVE_URL but see the NOTE in
    extractLiveStream — live extraction is currently a no-op).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body, or None on error.

        Errors are reported through the downloader; callers must be
        prepared for a None return.
        """
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and pull values out of it with *regex*.

        matchTuples is a list of (group_index, key, error_message)
        triples; matched groups are returned as a dict keyed by *key*.
        Returns None when the fetch fails, the regex does not match, or
        a required group is empty.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, `page` is None here and
        # re.search raises TypeError instead of reporting cleanly.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # The language code ('fr'/'de') sits four path components from
        # the end of a live index URL.
        video_lang = url.split('/')[-4]
        # Locate the videothek JS file that holds the stream description.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Pull the rtmp path, SWF player and stream URL out of the JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or
        # stored, so live streams are effectively not extracted —
        # confirm whether this is intentionally unfinished.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Language is the third path component from the end on Plus 7 URLs.
        video_lang = url.split('/')[-3]
        # Step 1: the player <param> points at a videoref XML URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref lists one <video> element per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language XML carries id, title, date and the
        # HD media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): 'date' is passed through as-is; upstream expects
        # upload_date in YYYYMMDD — confirm the XML format matches.
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # The video id here is just the last URL component (used only
        # for progress reporting).
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live index pages take the (currently dead-end) live path;
        # everything else is treated as a Plus 7 page.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1208
1209
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Matches any URL; first follows redirects (URL shorteners) and
    restarts the extractor chain on the target, otherwise scrapes the
    page for an embedded media URL (JW Player style flashvars, or any
    file=/source= parameter).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # Use HEAD so we learn the final URL without downloading bodies.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener with exactly the handlers we need; a bare
        # OpenerDirector starts with none.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the generic extraction proceed.
        if url == new_url:
            return False

        # Redirect found: restart the whole extractor chain on the target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # NOTE(review): webpage stays bytes here; the str regexes
            # below only work on Python 2 — confirm before porting.
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1354
1355
1356 class YoutubeSearchIE(InfoExtractor):
1357     """Information Extractor for YouTube search queries."""
1358     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1359     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1360     _max_youtube_results = 1000
1361     IE_NAME = u'youtube:search'
1362
    def __init__(self, downloader=None):
        # Plain delegation; kept explicit for symmetry with the other
        # extractors in this file.
        InfoExtractor.__init__(self, downloader)
1365
1366     def report_download_page(self, query, pagenum):
1367         """Report attempt to download search page with given number."""
1368         query = query.decode(preferredencoding())
1369         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1370
1371     def _real_extract(self, query):
1372         mobj = re.match(self._VALID_URL, query)
1373         if mobj is None:
1374             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1375             return
1376
1377         prefix, query = query.split(':')
1378         prefix = prefix[8:]
1379         query = query.encode('utf-8')
1380         if prefix == '':
1381             self._download_n_results(query, 1)
1382             return
1383         elif prefix == 'all':
1384             self._download_n_results(query, self._max_youtube_results)
1385             return
1386         else:
1387             try:
1388                 n = int(prefix)
1389                 if n <= 0:
1390                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1391                     return
1392                 elif n > self._max_youtube_results:
1393                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1394                     n = self._max_youtube_results
1395                 self._download_n_results(query, n)
1396                 return
1397             except ValueError: # parsing prefix as integer fails
1398                 self._download_n_results(query, 1)
1399                 return
1400
1401     def _download_n_results(self, query, n):
1402         """Downloads a specified number of results for a query"""
1403
1404         video_ids = []
1405         pagenum = 0
1406         limit = n
1407
1408         while (50 * pagenum) < limit:
1409             self.report_download_page(query, pagenum+1)
1410             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1411             request = compat_urllib_request.Request(result_url)
1412             try:
1413                 data = compat_urllib_request.urlopen(request).read()
1414             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1415                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1416                 return
1417             api_response = json.loads(data)['data']
1418
1419             new_ids = list(video['id'] for video in api_response['items'])
1420             video_ids += new_ids
1421
1422             limit = min(n, api_response['totalItems'])
1423             pagenum += 1
1424
1425         if len(video_ids) > n:
1426             video_ids = video_ids[:n]
1427         for id in video_ids:
1428             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1429         return
1430
1431
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearch[N|all]:terms)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query was encoded to utf-8 by _real_extract; decode with the same
        # codec instead of the locale's preferred encoding, which may differ.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:terms pseudo-URL and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so the search terms themselves
        # may contain ':' (a plain split(':') raised ValueError here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            # Keep the try narrow: only int(prefix) may legitimately raise
            # ValueError; previously errors escaping _download_n_results
            # were swallowed and misinterpreted as a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, stopping as soon as n are collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: download whatever was found so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1512
1513
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearch[N|all]:terms)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query was encoded to utf-8 by _real_extract; decode with the same
        # codec instead of the locale's preferred encoding, which may differ.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:terms pseudo-URL and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so the search terms themselves
        # may contain ':' (a plain split(':') raised ValueError here).
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            # Keep the try narrow: only int(prefix) may legitimately raise
            # ValueError; previously errors escaping _download_n_results
            # were swallowed and misinterpreted as a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, stopping as soon as n are collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link: download whatever was found so far.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1598
1599
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing video id means the URL addresses a single video.
        single_video = mobj.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'a' (artist) pages use a different endpoint/query parameter;
        # everything else is fetched as an ordinary 'p' playlist.
        if mobj.group(1) == 'a':
            prefix, access = 'a', 'artist'
        else:
            prefix, access = 'p', 'view_play_list'
        playlist_id = mobj.group(2)

        collected = []
        page_num = 1
        # Walk the paginated listing until the "Next" marker disappears.
        while True:
            self.report_download_page(playlist_id, page_num)
            page_url = self._TEMPLATE_URL % (access, prefix, playlist_id, page_num)
            try:
                html = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within this page while keeping their order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, html):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in html:
                break
            page_num = page_num + 1

        total = len(collected)

        # Honour the playliststart/playlistend options.
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        collected = collected[start:] if end == -1 else collected[start:end]

        if len(collected) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1678
1679
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce the download of one channel listing page."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of the channel and queue each watch URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        collected = []
        page_num = 1

        # Walk the paginated video listing until the "Next" marker disappears.
        while True:
            self.report_download_page(channel_id, page_num)
            page_url = self._TEMPLATE_URL % (channel_id, page_num)
            try:
                html = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within this page while keeping their order.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', html):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in html:
                break
            page_num = page_num + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1730
1731
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch the user's uploads through the GData feed and queue them."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData feed is paginated (_GDATA_PAGE_SIZE ids per request);
        # keep asking for the next slice until a short page signals the
        # end of the upload list.
        collected = []
        page_num = 0

        while True:
            start_index = page_num * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            feed_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(feed_url)).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within this page while keeping their order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            # A page shorter than _GDATA_PAGE_SIZE must be the last one,
            # so there is no need for another request.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_num += 1

        all_ids_count = len(collected)

        # Honour the playliststart/playlistend options.
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        collected = collected[start:] if end == -1 else collected[start:end]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(collected)))

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1813
1814
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The Ajax episode list needs the numeric user id, which is only
        # available from the user's HTML page.
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Guard against a page-layout change instead of crashing with an
            # uncaught AttributeError on mobj.group(1).
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str (not str) so unicode error messages do not blow
                # up on Python 2, consistent with the rest of this file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. De-duplicate on the *unescaped* id:
            # the previous code compared the raw match against unescaped
            # stored entries, letting duplicates through whenever the href
            # contained HTML entities.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page with fewer than _PAGE_SIZE ids must be the last one,
            # so there is no need for another request.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour the playliststart/playlistend options.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1905
1906
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so '\s' is a regex escape, not an (invalid,
                # deprecated) string escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1969
1970
1971 class FacebookIE(InfoExtractor):
1972     """Information Extractor for Facebook"""
1973
1974     _WORKING = False
1975     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1976     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1977     _NETRC_MACHINE = 'facebook'
1978     _available_formats = ['video', 'highqual', 'lowqual']
1979     _video_extensions = {
1980         'video': 'mp4',
1981         'highqual': 'mp4',
1982         'lowqual': 'mp4',
1983     }
1984     IE_NAME = u'facebook'
1985
    def __init__(self, downloader=None):
        # Delegate to the shared InfoExtractor initialisation; the optional
        # downloader is what all reporting/trouble calls go through.
        InfoExtractor.__init__(self, downloader)
1988
    def _reporter(self, message):
        """Add header and report message.

        All other report_* helpers of this IE funnel through here so every
        screen line carries the '[facebook]' prefix.
        """
        self._downloader.to_screen(u'[facebook] %s' % message)
1992
    def report_login(self):
        """Report attempt to log in (called from _real_initialize)."""
        self._reporter(u'Logging in')
1996
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)
2000
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)
2004
2005     def _parse_page(self, video_webpage):
2006         """Extract video information from page"""
2007         # General data
2008         data = {'title': r'\("video_title", "(.*?)"\)',
2009             'description': r'<div class="datawrap">(.*?)</div>',
2010             'owner': r'\("video_owner_name", "(.*?)"\)',
2011             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2012             }
2013         video_info = {}
2014         for piece in data.keys():
2015             mobj = re.search(data[piece], video_webpage)
2016             if mobj is not None:
2017                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2018
2019         # Video urls
2020         video_urls = {}
2021         for fmt in self._available_formats:
2022             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2023             if mobj is not None:
2024                 # URL is in a Javascript segment inside an escaped Unicode format within
2025                 # the generally utf-8 page
2026                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2027         video_info['video_urls'] = video_urls
2028
2029         return video_info
2030
    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from the downloader params (--username/--password)
        or, with --netrc, from the user's .netrc file. All failures are
        reported as warnings and extraction continues unauthenticated.
        """
        # Without a downloader there are no params to read and nowhere to
        # report problems, so there is nothing to do.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # netrc authenticators() returns (login, account, password).
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken/missing .netrc is only a warning; proceed without login.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: stay anonymous.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means the login
            # was rejected (bad credentials or rate limiting).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2074
2075     def _real_extract(self, url):
2076         mobj = re.match(self._VALID_URL, url)
2077         if mobj is None:
2078             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2079             return
2080         video_id = mobj.group('ID')
2081
2082         # Get video webpage
2083         self.report_video_webpage_download(video_id)
2084         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2085         try:
2086             page = compat_urllib_request.urlopen(request)
2087             video_webpage = page.read()
2088         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2089             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2090             return
2091
2092         # Start extracting information
2093         self.report_information_extraction(video_id)
2094
2095         # Extract information
2096         video_info = self._parse_page(video_webpage)
2097
2098         # uploader
2099         if 'owner' not in video_info:
2100             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2101             return
2102         video_uploader = video_info['owner']
2103
2104         # title
2105         if 'title' not in video_info:
2106             self._downloader.trouble(u'ERROR: unable to extract video title')
2107             return
2108         video_title = video_info['title']
2109         video_title = video_title.decode('utf-8')
2110
2111         # thumbnail image
2112         if 'thumbnail' not in video_info:
2113             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2114             video_thumbnail = ''
2115         else:
2116             video_thumbnail = video_info['thumbnail']
2117
2118         # upload date
2119         upload_date = None
2120         if 'upload_date' in video_info:
2121             upload_time = video_info['upload_date']
2122             timetuple = email.utils.parsedate_tz(upload_time)
2123             if timetuple is not None:
2124                 try:
2125                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2126                 except:
2127                     pass
2128
2129         # description
2130         video_description = video_info.get('description', 'No description available.')
2131
2132         url_map = video_info['video_urls']
2133         if url_map:
2134             # Decide which formats to download
2135             req_format = self._downloader.params.get('format', None)
2136             format_limit = self._downloader.params.get('format_limit', None)
2137
2138             if format_limit is not None and format_limit in self._available_formats:
2139                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2140             else:
2141                 format_list = self._available_formats
2142             existing_formats = [x for x in format_list if x in url_map]
2143             if len(existing_formats) == 0:
2144                 self._downloader.trouble(u'ERROR: no known formats available for video')
2145                 return
2146             if req_format is None:
2147                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2148             elif req_format == 'worst':
2149                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2150             elif req_format == '-1':
2151                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2152             else:
2153                 # Specific format
2154                 if req_format not in url_map:
2155                     self._downloader.trouble(u'ERROR: requested format not available')
2156                     return
2157                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2158
2159         results = []
2160         for format_param, video_real_url in video_url_list:
2161             # Extension
2162             video_extension = self._video_extensions.get(format_param, 'mp4')
2163
2164             results.append({
2165                 'id':       video_id.decode('utf-8'),
2166                 'url':      video_real_url.decode('utf-8'),
2167                 'uploader': video_uploader.decode('utf-8'),
2168                 'upload_date':  upload_date,
2169                 'title':    video_title,
2170                 'ext':      video_extension.decode('utf-8'),
2171                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2172                 'thumbnail':    video_thumbnail.decode('utf-8'),
2173                 'description':  video_description.decode('utf-8'),
2174             })
2175         return results
2176
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the file extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information for a blip.tv URL.

        Appends skin=json parameters to request the page metadata as JSON.
        If the server instead answers with a video/* Content-Type, the URL
        is treated as a direct media download and the open handle is reused.
        Returns a single-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for a JSON rendition of the page metadata.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # Derive id/title/extension from the URL's basename.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh   # reuse the already-open handle for the download
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                # The interesting fields may be nested under a 'Post' key.
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Normalize the 'datestamp' field (e.g. '12-31-12 11:59PM'
                # per the format string below) to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): global User-Agent override for the subsequent media
        # download — presumably required by blip.tv's servers; confirm.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2266
2267
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a single-element list of info dicts, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble (attribute typo), which
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link embeds the media server path; the FLV file lives
        # under the same path as /<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2325
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like: http://www.thedailyshow.com/full-episodes/<slug>
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, best quality first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> frame size, used only for the --list-formats display.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of a media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's RSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report the resolution of the player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        # Human-readable listing for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per item in the episode's media index.

        Resolves :shortname abbreviations to the newest full episode (via
        HTTP redirect), locates the mtvnservices media URI in the page,
        downloads the MRSS index, and picks a bitrate for each item.
        Returns a list of info dicts, or None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert abbreviations into the full-episodes URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URLs carry the title in a show-specific group.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode slug means "newest episode": follow the
            # redirect the site issues for the bare full-episodes URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn the concrete episode slug.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mtvnservices player URL(s) embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id in a data-mgid attribute
            # without a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Follow the player URL's redirects to obtain the final player URL.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Fetch the MRSS index listing the episode's media items.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # The <guid> is a colon-separated media id; the last component is
            # the short id, the second-to-last names the show.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-item configuration XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2536
2537
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the page's og: metadata and the player
        configuration JSON it points at.

        Returns a single-element list of info dicts, or None after reporting
        an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # BUGFIX: these matches were previously used without None checks, so
        # any missing <meta> tag crashed with AttributeError. Description and
        # thumbnail are optional; the player URL (and the config URL inside
        # it) are required to proceed.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2611
2612
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False  # extractor is flagged as not currently working
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the
        Adobe f4m manifest it references.

        Returns a single-element list of info dicts, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: site metadata XML (title, description, thumbnail,
        # and the URL of the f4m manifest).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: the f4m manifest, which names the media segment.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first fragment URL from the manifest id and media url.
        # NOTE(review): video_id[:-2] drops the last two characters of the
        # manifest <id> — presumably a fixed suffix; confirm against live data.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2683
2684
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract media URL, title, and thumbnail from an xvideos page.

        Returns a single-element list of info dicts, or None after
        reporting an error through the downloader.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the watch page.
        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(page_request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is URL-encoded inside a flv_url= query parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the page <title>, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the whole matched URL is used (group 0),
        # not the capture group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2754
2755
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves a track page URL through the public SoundCloud API,
       then asks the CDN stream endpoint for the media URL of the
       128kbps MP3 stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the page URL is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream URL is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader name and the slug of the track title are
        # components of the URL itself.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Step 1: resolve the page URL into track metadata (JSON).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        resolve_request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Step 2: fetch the stream listing for this track from the CDN.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams_request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2828
2829
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    The video URL is carried base64-encoded in the page's ``jsclassref``
    attribute and points into InfoQ's RTMPE server.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        import base64  # local import: keeps the file-level import block untouched

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # urlopen().read() returns bytes on Python 3; decode right away
            # so the str regexes below work on both Python versions.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL (base64-encoded in jsclassref)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        # str.decode('base64') no longer exists on Python 3;
        # base64.b64decode is the portable spelling.
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1)).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The last path component of the RTMP URL doubles as id + extension.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2898
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format; when the format
        entry maps bitrates to url lists, picks the given bitrate or the
        highest one available.
        """
        # NOTE(review): file_url is assigned but never used in this method.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            # Format entry is a plain url list rather than a bitrate dict.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a full GET; the first one that opens
        # without raising wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print a table of available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE(review): the .decode('utf-8') calls in this method assume
        # Python 2 byte strings; on Python 3 match groups are already str
        # and .decode raises. Extractor is disabled via _WORKING above.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each advertised format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3013
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes: a specific video page (returns one video),
    a course page (expanded recursively into all its video pages), and
    the root page (expanded recursively into all course pages).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # ElementTree accepts bytes directly, so no decode needed here.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode to text so the str regexes below also work on
                # Python 3, where urlopen().read() returns bytes.
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse into every video page of the course.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Same bytes-vs-str consideration as for the course page above.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse into every course page found on the root page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3130
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode once here so the str regexes below work on Python 3 as
            # well (urlopen().read() returns bytes there). iso-8859-1 is the
            # charset the original code used for the extracted meta values.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Song name and performer come from <meta> tags in the page head.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed message: it previously read "unable to mtvn_uri".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3220
3221
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com; downloads a video as its segments."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a session id from the current time (ms) plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the source alphabet using `seed`.

        Implements the site's linear-congruential shuffle: each step
        advances the seed and removes one character from `source`,
        appending it to the result. Returns the shuffled list.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index string into the real file id
        by looking each index up in the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                # NOTE(review): any other requested format is silently mapped
                # to 'flv' — the user's value is not honored; confirm intended.
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the (hex) segment number into the decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3336
3337
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page up front; everything below is
        # regex scraping against the decoded text.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Video URL is URL-encoded inside the flash configuration.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3400
3401
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: the first line of the post description
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bug fix: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bug fix: previously fell through and crashed on links[-1].
            return

        # Sort by resolution (first tuple element)
        links = sorted(mobj)

        # Choose the last of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3525
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    Builds the final MP4 URL from the CDN pattern and scrapes
    title/date/description out of the HTML page.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The video file follows a fixed CDN naming scheme.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Scrape a single property from the page; fall back to *default*.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was the dead key 'uploader_date' which no consumer
            # reads; the documented optional field is 'upload_date'.
            # NOTE(review): scraped value is the site's display format, not
            # necessarily YYYYMMDD — confirm downstream tolerance.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3571
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one JSON page of clips and return (total_count, infos).

        Returns (0, []) on download failure so the caller's tuple
        unpacking and pagination loop terminate cleanly.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: a bare `return` (None) crashed the caller, which
            # unpacks the result as a 2-tuple.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped but still counted in
            # the page total (len(response)) used for pagination.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only => channel URL: page through the archive API.
        # Group 2 matched => single clip: one API call.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3650
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Bug fix: previously fell through and crashed on m.group(None).
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Bug fix: return instead of dereferencing a failed match.
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; absence is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3698
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
    # Bug fix: report_extraction reads self.IE_NAME, but this class never
    # defined it (every sibling extractor does) -> AttributeError.
    IE_NAME = u'TweetReel'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The status ID names the .mov file on the CDN (see video_url below).
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Bug fix: previously fell through and crashed on m.group(1).
            return
        status_id = m.group(1)

        # Description doubles as the title; it is only a warning if absent.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded <a> links before unescaping the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        # Convert the unix timestamp to the standard YYYYMMDD field format.
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]