Refactor YouTube subtitles code; the old version was ugly (my bad)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor takes a URL and produces metadata about the
    video (or videos) that URL refers to: the real media URL, the title,
    the uploader, and so on. Each result is a dictionary that is handed
    to the FileDownloader, which may download the media to the file
    system, print the metadata, or perform any other configured action.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regular expression; they should normally also be
    added to the list of extractors. _real_extract() must return a *list*
    of information dictionaries as described above. Broken extractors
    should set the _WORKING attribute to False so users are warned and
    the tests are skipped.
    """

    # Class-level defaults; __init__ resets the per-instance state.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this IE is marked as working (_WORKING)."""
        return self._WORKING

    def initialize(self):
        """Perform one-time initialization (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return a list of info dicts for url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization process; redefined in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction process; redefined in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unknown falls back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" string, for --list-formats display
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with embedded comments, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without an explicit duration get 4 seconds
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning, srt) tuple: on success warning is None and srt
        holds the converted SubRip text; on failure srt is None and warning
        carries a user-readable message.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            # dict views are not subscriptable on Python 3; materialize first
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the info dict(s) for a YouTube URL (one per chosen format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # the URL is JS-escaped in the page; strip the backslashes
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, stop at the first response
        # that contains a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # stop at the first format that parses; without the break
                    # later expressions would be tried on the already-converted
                    # YYYYMMDD string
                    break
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # require 'sig' as well: the dict comprehension below reads it, and
            # an entry without a signature would otherwise raise KeyError
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
509
510
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints hit during initialization so that age-restricted
    # videos are reachable afterwards.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the form disabling the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the single info dict for a Metacafe watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; Metacafe mirrors YouTube videos
        # under yt-<id>, so delegate those downloads to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # NOTE(review): webpage is not decoded here, so the regex groups
            # below are bytes on Python 3 — this code path assumes Python 2
            # str semantics (see the .decode('utf-8') calls further down).
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct mediaURL present in the page
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it must be appended as __gda__
            # for the media URL to be valid
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: pull mediaURL and key out of the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # the URL is JS-escaped inside the JSON-ish blob
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
636
637
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, media URL, title, uploader and upload date for a
        Dailymotion video page.  Returns a one-element list of info dicts,
        or None after reporting trouble."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The captured path component may carry a slug ('_...') and/or a
        # query string ('?...'); the bare id is everything before either.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are served
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, in decreasing order of preference
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # URLs in flashvars are percent-encoded and use escaped slashes
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # Fall back to looking for an official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; info dict wants YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
734
735
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, media URL, title and uploader for a Photobucket
        .flv page.  Returns a one-element list of info dicts, or None
        after reporting trouble."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode the page once so the regex matches below are text on
            # both Python 2 and 3 (matches the other extractors' pattern);
            # the old per-match .decode('utf-8') calls crashed on Python 3
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
799
800
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract information for a Yahoo! Video URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form, after which the method recurses once with
        new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the '(people|profile)' path alternative; the actual
        # uploader name is the second capture group
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist service below
        # requires them as request parameters)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
942
943
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs (dot after (?:www|player) is escaped so
    # it no longer matches arbitrary characters)
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract information for a Vimeo video from the player config
        JSON embedded in the page.  Returns a one-element list of info
        dicts, or None after reporting trouble."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON between the ' = {config:' and ',assets:'
        # markers of the player setup script
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: marker not present; ValueError: malformed JSON.
            # (Was a bare except, which also swallowed KeyboardInterrupt.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1056
1057
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page, or None after reporting trouble."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        # NOTE(review): on failure the early returns above yield None;
        # grep_webpage() then passes None to re.search(), which raises
        # TypeError instead of failing gracefully.
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and collect regex capture groups into a dict.

        matchTuples is a list of (group_index, key, error_message) triples;
        each capture group is stored under *key*.  Returns None (after
        reporting the corresponding error) if the page does not match or a
        group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to locate the live RTMP stream.

        NOTE(review): video_url is computed on the last line but never
        returned or stored, so this path currently produces no download.
        """
        # Language code ('fr'/'de') is the 4th-from-last path component
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page to its HD stream and return an info dict."""
        # Language code ('fr'/'de') is the 3rd-from-last path component
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages ('index-NN.html') take a separate path that currently
        # returns no info dict (see extractLiveStream above)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1193
1194
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue a HEAD request so we resolve the redirect chain
            # without downloading any body
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following: not a redirect, continue extraction
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scan the page for a direct media URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # NOTE(review): webpage stays bytes here; under Python 3 the str
            # regexes below would need a decoded page — confirm interpreter
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Derive id and extension from the media file's basename
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # (was a copy-pasted 'unable to extract title' message)
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1339
1340
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles "ytsearch:<query>", "ytsearchN:<query>" and
    "ytsearchall:<query>" pseudo-URLs by querying the YouTube GData API
    and queueing each resulting watch URL for download.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so search terms that themselves
        # contain ':' do not blow up the two-element unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try body minimal: only int() can raise here.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the total number of hits; never request
            # more pages than it can actually deliver.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1415
1416
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles "gvsearch:<query>", "gvsearchN:<query>" and
    "gvsearchall:<query>" pseudo-URLs by scraping Google Video search
    result pages and queueing each hit for download.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so search terms that themselves
        # contain ':' do not blow up the two-element unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try body minimal: only int() can raise here.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached; stop early.
                        self._queue_downloads(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we found.
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        """Hand the collected video ids over to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1497
1498
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles "yvsearch:<query>", "yvsearchN:<query>" and
    "yvsearchall:<query>" pseudo-URLs by scraping Yahoo! Video search
    result pages and queueing each hit for download.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only, so search terms that themselves
        # contain ':' do not blow up the two-element unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try body minimal: only int() can raise here.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached; stop early.
                        self._queue_downloads(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we found.
                self._queue_downloads(video_ids)
                return

            pagenum = pagenum + 1

    def _queue_downloads(self, video_ids):
        """Hand the collected video ids over to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1583
1584
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing video id means the URL points at a single video,
        # not a whole playlist.
        single_video = mobj.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default prefix for playlists; artist pages ('a')
        # need a different access path.
        list_prefix = mobj.group(1)
        if list_prefix == 'a':
            list_access = 'artist'
        else:
            list_prefix = 'p'
            list_access = 'view_play_list'
        playlist_id = mobj.group(2)

        # Walk the paginated playlist until the "next page" marker
        # disappears, collecting video ids in order.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            request = compat_urllib_request.Request(
                self._TEMPLATE_URL % (list_access, list_prefix, playlist_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids on this page, first-seen order, page-local dedup.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
1663
1664
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # Walk the channel's paginated video listing until the
        # "next page" marker disappears.
        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids on this page, first-seen order, page-local dedup.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
1715
1716
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so request page after page until one comes back short.
        video_ids = []
        pagenum = 0
        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids on this page, first-seen order, page-local dedup.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A page shorter than _GDATA_PAGE_SIZE must be the last
            # one; skip the extra round-trip.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1798
1799
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric id, then pages through the
    mobile episode-list AJAX endpoint and queues every video found.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is required for the AJAX URL. Guard the
        # search result explicitly: a missing attribute used to raise
        # an uncaught AttributeError inside the network try-block.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for parity with the other network error paths.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; dedup on the unescaped id so the
            # membership test compares like with like.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE must be the last one -
            # there are no more ids on further pages, so no need to
            # query again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1890
1891
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string: '\s' must reach the regex engine, not be
                # treated as a (deprecated) string escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode() calls below assume Python 2 byte
        # strings; on Python 3 `webpage` is bytes and decoding would have
        # to happen before the regex matches — confirm target runtime.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1954
1955
1956 class FacebookIE(InfoExtractor):
1957     """Information Extractor for Facebook"""
1958
1959     _WORKING = False
1960     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1961     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1962     _NETRC_MACHINE = 'facebook'
1963     _available_formats = ['video', 'highqual', 'lowqual']
1964     _video_extensions = {
1965         'video': 'mp4',
1966         'highqual': 'mp4',
1967         'lowqual': 'mp4',
1968     }
1969     IE_NAME = u'facebook'
1970
    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extractor-specific state.
        InfoExtractor.__init__(self, downloader)
1973
    def _reporter(self, message):
        """Add header and report message."""
        # Central helper so every status line carries the '[facebook]' prefix.
        self._downloader.to_screen(u'[facebook] %s' % message)
1977
    def report_login(self):
        """Report attempt to log in."""
        # Routed through _reporter for the shared '[facebook]' prefix.
        self._reporter(u'Logging in')
1981
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        # Routed through _reporter for the shared '[facebook]' prefix.
        self._reporter(u'%s: Downloading video webpage' % video_id)
1985
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        # Routed through _reporter for the shared '[facebook]' prefix.
        self._reporter(u'%s: Extracting video information' % video_id)
1989
1990     def _parse_page(self, video_webpage):
1991         """Extract video information from page"""
1992         # General data
1993         data = {'title': r'\("video_title", "(.*?)"\)',
1994             'description': r'<div class="datawrap">(.*?)</div>',
1995             'owner': r'\("video_owner_name", "(.*?)"\)',
1996             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1997             }
1998         video_info = {}
1999         for piece in data.keys():
2000             mobj = re.search(data[piece], video_webpage)
2001             if mobj is not None:
2002                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2003
2004         # Video urls
2005         video_urls = {}
2006         for fmt in self._available_formats:
2007             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2008             if mobj is not None:
2009                 # URL is in a Javascript segment inside an escaped Unicode format within
2010                 # the generally utf-8 page
2011                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2012         video_info['video_urls'] = video_urls
2013
2014         return video_info
2015
    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken .netrc only costs the login, not the run: warn and
                # continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form shows up again in the response, the login
            # was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2059
2060     def _real_extract(self, url):
2061         mobj = re.match(self._VALID_URL, url)
2062         if mobj is None:
2063             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2064             return
2065         video_id = mobj.group('ID')
2066
2067         # Get video webpage
2068         self.report_video_webpage_download(video_id)
2069         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2070         try:
2071             page = compat_urllib_request.urlopen(request)
2072             video_webpage = page.read()
2073         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2074             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2075             return
2076
2077         # Start extracting information
2078         self.report_information_extraction(video_id)
2079
2080         # Extract information
2081         video_info = self._parse_page(video_webpage)
2082
2083         # uploader
2084         if 'owner' not in video_info:
2085             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2086             return
2087         video_uploader = video_info['owner']
2088
2089         # title
2090         if 'title' not in video_info:
2091             self._downloader.trouble(u'ERROR: unable to extract video title')
2092             return
2093         video_title = video_info['title']
2094         video_title = video_title.decode('utf-8')
2095
2096         # thumbnail image
2097         if 'thumbnail' not in video_info:
2098             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2099             video_thumbnail = ''
2100         else:
2101             video_thumbnail = video_info['thumbnail']
2102
2103         # upload date
2104         upload_date = None
2105         if 'upload_date' in video_info:
2106             upload_time = video_info['upload_date']
2107             timetuple = email.utils.parsedate_tz(upload_time)
2108             if timetuple is not None:
2109                 try:
2110                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2111                 except:
2112                     pass
2113
2114         # description
2115         video_description = video_info.get('description', 'No description available.')
2116
2117         url_map = video_info['video_urls']
2118         if len(url_map.keys()) > 0:
2119             # Decide which formats to download
2120             req_format = self._downloader.params.get('format', None)
2121             format_limit = self._downloader.params.get('format_limit', None)
2122
2123             if format_limit is not None and format_limit in self._available_formats:
2124                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2125             else:
2126                 format_list = self._available_formats
2127             existing_formats = [x for x in format_list if x in url_map]
2128             if len(existing_formats) == 0:
2129                 self._downloader.trouble(u'ERROR: no known formats available for video')
2130                 return
2131             if req_format is None:
2132                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2133             elif req_format == 'worst':
2134                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2135             elif req_format == '-1':
2136                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2137             else:
2138                 # Specific format
2139                 if req_format not in url_map:
2140                     self._downloader.trouble(u'ERROR: requested format not available')
2141                     return
2142                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2143
2144         results = []
2145         for format_param, video_real_url in video_url_list:
2146             # Extension
2147             video_extension = self._video_extensions.get(format_param, 'mp4')
2148
2149             results.append({
2150                 'id':       video_id.decode('utf-8'),
2151                 'url':      video_real_url.decode('utf-8'),
2152                 'uploader': video_uploader.decode('utf-8'),
2153                 'upload_date':  upload_date,
2154                 'title':    video_title,
2155                 'ext':      video_extension.decode('utf-8'),
2156                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2157                 'thumbnail':    video_thumbnail.decode('utf-8'),
2158                 'description':  video_description.decode('utf-8'),
2159             })
2160         return results
2161
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles both direct media links (the server answers with a video/*
    Content-Type) and regular page URLs, whose metadata is fetched via
    blip.tv's JSON API (skin=json).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Request JSON metadata by appending the skin parameters.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # On Python 2 the URL is a byte string; Python 3 str has no
                # .decode(), which used to crash here on this code path.
                if hasattr(title, 'decode'):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the file extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves some content only to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2251
2252
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download (nonexistent
            # attribute), raising AttributeError instead of reporting.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link embeds the media base URL; the flv lives
        # next to the thumbnails directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2310
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, highest first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # All feed renditions are MP4.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Resolution per bitrate, used only for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class's
        # plain re.match would not work here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report per-media configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL to a list of info dicts, one per feed item.

        Shortcut names (:tds, :colbert, ...) are first expanded to the
        show's full-episodes page; the newest episode is then found by
        following the HTTP redirect of that page.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand shortcut names to the corresponding show URL and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode/clip title and whether we must follow the
        # redirect to discover the newest episode.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the newest episode;
            # re-parse the final URL to obtain its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Resolve the player URL (follows redirects to the final SWF).
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Fetch the MRSS index feed listing all items for this episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like ...:<show>.com:...:<shortMediaId>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-item configuration lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2521
2522
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report player configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset from the Content-Type header, default to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Page metadata lives in <meta> tags. Previously a missing tag
        # crashed with AttributeError on .group(None); report trouble
        # explicitly instead, matching the rest of this file.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2596
2597
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML, then the Adobe HDS (f4m)
    manifest, and builds the fragment URL from both.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # First request: metadata XML (title, description, manifest URL).
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: HDS manifest describing the media fragments.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            # (was `except IndexError as err` with `err` unused)
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2668
2669
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the watch page and pull the media URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(page_request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Media URL: percent-encoded in a flashvars-style parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title: taken from the page <title>, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the entire matched URL is kept (group 0).
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2739
2740
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The track metadata is obtained from the public resolve.json API and the
    actual media URL from the sndcdn streams endpoint; both requests carry
    the same client_id.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader name and the track slug come straight from the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the track's API resource.
        track_page = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = ('http://api.soundcloud.com/resolve.json?url=' + track_page
                      + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28')
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask the CDN for the stream URLs of this track.
        streams_url = ('https://api.sndcdn.com/i1/tracks/' + str(video_id)
                       + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28')
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2813
2814
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com presentation videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description of an InfoQ page."""
        import base64  # local import: only this extractor needs it

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text immediately: applying str regexes to a bytes
            # webpage (as the old code did) fails on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded rtmpe path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        # str.decode('base64') exists only on Python 2; the base64 module
        # works on both interpreters.
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a readable fallback)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2883
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format/bitrate; when the
        format entry carries no bitrate mapping, it is itself the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print all available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match groups are already text; the old .decode('utf-8')
        # calls were Python-2-only and crashed on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (the response body is bytes; decode for Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2998
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a CoursePage (course only) or a VideoPage
    # (course + video); the named groups select the branch taken below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL kind: single video, course playlist, or site root.

        The playlist branches recurse through self.extract() on every
        referenced page, so the returned list is fully resolved info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # <title> and <videoFile> are both required for a usable entry.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            # Title and description are optional; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a reference
            # entry that is resolved recursively via self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each CoursePage link is recursed into, which in turn recurses
            # into its videos (see the course branch above).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3115
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract metadata and the highest-quality stream for an MTV video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode the page up front so the str regexes below also work on
            # Python 3; iso-8859-1 matches the old per-field decoding and
            # never raises on arbitrary bytes.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: the message used to read 'unable to mtvn_uri'
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3205
3206
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served as numbered segments; the real file id for each
    segment is decoded from a seed-shuffled alphabet.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time."""
        return "%d%d%d" % (int(time.time() * 1000),
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-shuffled alphabet used to decode file ids."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        seed = float(seed)
        # Linear-congruential shuffle: each step picks (and removes) one
        # character from the remaining pool.
        while pool:
            seed = (seed * 211 + 30031) % 65536
            idx = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(idx))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            video_info = config['data'][0]

            video_title = video_info['title']
            seed = video_info['seed']

            format = self._downloader.params.get('format', None)
            supported_format = video_info['streamfileids'].keys()

            # Map the requested quality onto one of Youku's stream formats.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = video_info['streamfileids'][format]
            keys = [seg['k'] for seg in video_info['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id carry the segment number, so they
        # are replaced per segment.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3321
3322
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Format the error through compat_str like every other extractor
            # in this file (the old '%s' % err was inconsistent).
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3385
3386
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: without this return, mobj.group(1) below raised
            # AttributeError instead of failing cleanly.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: without this return, links[-1] below raised IndexError.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3510
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages.

    The final MP4 URL is derived directly from the video id embedded in
    the page URL; title, date and description are scraped from the HTML.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Strip a trailing '/index.html' so the id maps onto the CDN path.
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # The request fetches an HTML page, not XML; report it as such.
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The CDN URL can be built from the id without parsing the page.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Was 'uploader_date' — a typo; the downloader reads 'upload_date'.
            # NOTE(review): value is the site's human-readable date, not
            # YYYYMMDD as the field documentation specifies — verify upstream.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3556
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page and return (item count, list of valid infos).

        On a download error, reports trouble and returns (0, []) so that
        callers unpacking the result do not crash on None.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Previously returned None, which made _real_extract raise a
            # TypeError when unpacking the result tuple.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Some API entries have an empty video_file_url; skip those.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; squash to YYYYMMDD.
                video_date = clip['created_on'][:10].replace('-', '')
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 alone matched: a whole channel, which must be paged through.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we have reached the end.
            if not paged or page_count != limit:
                break
            offset += limit
        return info