Google Video has been shut down as of 11/15/2012. All videos on Google Video will...
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces metadata about the
    video (or videos) that URL refers to: the real media URL, the title,
    the uploader, and so on.  The metadata is returned as a list of
    dictionaries which is handed to the FileDownloader; the FileDownloader
    then acts on it, possibly downloading the media to disk.

    Mandatory keys in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses must define a _VALID_URL regular expression and re-define
    the _real_initialize() and _real_extract() methods; _real_extract()
    must return a *list* of dictionaries as described above.  Subclasses
    should usually also be registered in the list of extractors.

    Extractors that are known to be broken should set _WORKING to False
    so users are warned and the test suite skips them.
    """

    # Class-level defaults; _ready flips to True once _real_initialize ran.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching *downloader*."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when *url* is handled by this extractor."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is known to work."""
        return self._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.) at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dictionaries."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader used for output and progress reporting."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode pattern: also accepts naked video IDs (group 1 unmatched).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; itags missing here default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> approximate dimensions shown by --list-formats ('???' if unknown).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the base-class check
        # (which compiles the pattern without re.VERBOSE) cannot be reused.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the attribute is absent
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the itag, extension and dimensions of each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age.  All failures are non-fatal warnings except the
        final age confirmation."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the watch page and get_video_info, then return a list of
        info dictionaries (one per requested format)."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one response carries a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Parsed successfully; stop so the YYYYMMDD result is not
                    # fed back into strptime with the remaining formats.
                    break
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                # Map language code -> track name
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not subscriptable on Python 3;
                    # materialize before indexing.
                    srt_lang = list(srt_lang_list.keys())[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
            except Trouble as trouble:
                self._downloader.trouble(str(trouble))

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST the form that
        disables the filter (required before watch pages are served)."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a watch page and
        return a single-element list of info dictionaries."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # urlopen().read() returns bytes; decode once so the str regexes
            # below work on Python 3 (same pattern as the other extractors).
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Matches on a decoded str page are already text; no .decode() needed
        # (str.decode does not exist on Python 3).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component is "<id>_<title-slug>"; keep only the id part
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page; the cookie disables Dailymotion's family filter
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            raw_page = compat_urllib_request.urlopen(request).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URLs live in the page's "flashvars" JS variable
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality, highest first
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # URLs in flashvars are percent-encoded and use escaped slashes
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then the "official user" markup
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; the info dict wants YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
732
733
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # urlopen().read() returns bytes; decode so the str regexes below
            # work on Python 3 (mixing str patterns and bytes raises TypeError)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Regex groups are already text; the previous .decode('utf-8') calls on
        # these str values crashed on Python 3 (str has no decode method)
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
797
798
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; recurses once after rewriting non-/watch/ URLs."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # urlopen().read() returns bytes; decode so the str regexes
                # below work on Python 3
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # BUG FIX: the (people|profile) alternation is now non-capturing;
        # previously group(1) returned "people"/"profile" instead of the
        # uploader name enclosed in the <a> tag
        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(?:people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        # unquote already returns text; the old .decode('utf-8') on its result
        # crashed on Python 3
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
939
940
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the config JSON embedded in a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON wedged between ' = {config:' and ',assets:'
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: marker not found in the page; ValueError: malformed
            # JSON. (This was a bare 'except:', which also swallowed
            # KeyboardInterrupt and genuine programming errors.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1053
1054
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body as text, or None on failure."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # urlopen().read() returns bytes; decode so the str regexes used in
            # grep_webpage work on Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return {key: group} per matchTuples.

        Returns None (after reporting the problem) when the page cannot be
        fetched, the regex does not match, or a required group is missing.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage already reported the error; bail out instead of
            # crashing with re.search(regex, None) (TypeError)
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP parameters of a live stream page.

        NOTE(review): video_url is computed but never returned or used, so
        live streams are effectively not downloaded — confirm intended
        behavior before relying on this path.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            # error already reported by grep_webpage
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an arte+7 page and build the info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            # error already reported by grep_webpage
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            # site-provided date string; presumably not YYYYMMDD — TODO confirm
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # live streams are reported but not extracted (see extractLiveStream)
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)
            # extraction failures now return None instead of crashing here
            if info is None:
                return

        return [info]
1190
1191
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect (e.g. URL shortener) is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same final URL means there was no redirect to follow
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # urlopen().read() returns bytes; decode so the str regexes below
            # work on Python 3. Use 'replace' because these are arbitrary
            # third-party pages with unknown encodings.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Derive extension and id from the media file's basename
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: this error path previously reported
            # 'unable to extract title' (copy-paste mistake)
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1336
1337
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only; search terms containing colons
        # (e.g. "ytsearch3:foo: bar") previously raised ValueError from
        # the two-target unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'ytsearch:' downloads the single best match.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' shrinks to the API's totalItems once known, so we never
        # request pages past the end of the result set.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1412
1413
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only; search terms containing colons
        # previously raised ValueError from the two-target unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'gvsearch:' downloads the single best match.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; stop as soon as n distinct ids are found.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue whatever was collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1494
1495
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only; search terms containing colons
        # previously raised ValueError from the two-target unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'yvsearch:' downloads the single best match.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; stop as soon as n distinct ids are found.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link: queue whatever was collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1578
1579
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a playlist and queue them for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing video id means the URL points at one specific video.
        single_video = match.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default playlist prefix; artist pages need extra care.
        playlist_prefix = match.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = match.group(2)

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, keeping first-seen order without duplicates.
            page_ids = []
            for m in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                vid = m.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # Stop once the "Next" pager marker disappears.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end (an end of -1 means "all").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1658
1659
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk every listing page of a channel and queue all of its videos."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, first-seen order, no duplicates.
            page_ids = []
            for m in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = m.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # Stop once the "Next" pager marker disappears.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1710
1711
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through a user's GData uploads feed and queue every video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The Data API caps each response at _GDATA_PAGE_SIZE entries, so
        # we request page after page until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, first-seen order, no duplicates within the page.
            page_ids = []
            for m in re.finditer(self._VIDEO_INDICATOR, page):
                if m.group(1) not in page_ids:
                    page_ids.append(m.group(1))
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one: no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (an end of -1 means "all").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1793
1794
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every video of a blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id lives in a data attribute of the profile page.
        # A failed match used to raise an uncaught AttributeError (None has
        # no .group) inside the network-error try block; report it properly.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other network errors
                # in this file (was plain str here).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            # NOTE(review): the membership test uses the raw match while the
            # stored value is unescaped; kept as-is to preserve behavior.
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (an end of -1 means "all").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1885
1886
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the download page with the free button pressed and extract the file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if match is None or match.group(1) is None:
            # Try to figure out reason of the error.
            reason = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if reason is not None and reason.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', reason.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = match.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        match = re.search(r'<b title="(.*?)">', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = match.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1949
1950
1951 class FacebookIE(InfoExtractor):
1952     """Information Extractor for Facebook"""
1953
1954     _WORKING = False
1955     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1956     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1957     _NETRC_MACHINE = 'facebook'
1958     _available_formats = ['video', 'highqual', 'lowqual']
1959     _video_extensions = {
1960         'video': 'mp4',
1961         'highqual': 'mp4',
1962         'lowqual': 'mp4',
1963     }
1964     IE_NAME = u'facebook'
1965
1966     def __init__(self, downloader=None):
1967         InfoExtractor.__init__(self, downloader)
1968
1969     def _reporter(self, message):
1970         """Add header and report message."""
1971         self._downloader.to_screen(u'[facebook] %s' % message)
1972
    def report_login(self):
        """Report attempt to log in."""
        # Informational only; the actual login happens in _real_initialize.
        self._reporter(u'Logging in')
1976
1977     def report_video_webpage_download(self, video_id):
1978         """Report attempt to download video webpage."""
1979         self._reporter(u'%s: Downloading video webpage' % video_id)
1980
1981     def report_information_extraction(self, video_id):
1982         """Report attempt to extract video information."""
1983         self._reporter(u'%s: Extracting video information' % video_id)
1984
1985     def _parse_page(self, video_webpage):
1986         """Extract video information from page"""
1987         # General data
1988         data = {'title': r'\("video_title", "(.*?)"\)',
1989             'description': r'<div class="datawrap">(.*?)</div>',
1990             'owner': r'\("video_owner_name", "(.*?)"\)',
1991             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1992             }
1993         video_info = {}
1994         for piece in data.keys():
1995             mobj = re.search(data[piece], video_webpage)
1996             if mobj is not None:
1997                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
1998
1999         # Video urls
2000         video_urls = {}
2001         for fmt in self._available_formats:
2002             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2003             if mobj is not None:
2004                 # URL is in a Javascript segment inside an escaped Unicode format within
2005                 # the generally utf-8 page
2006                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2007         video_info['video_urls'] = video_urls
2008
2009         return video_info
2010
    def _real_initialize(self):
        """Log in to Facebook if credentials are available; otherwise a silent no-op.

        Credentials come from --username/--password, or with --usenetrc from
        the 'facebook' machine entry in ~/.netrc.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    # Treat a missing entry like a parse problem so both paths
                    # hit the same warning below.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: skip the login step entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Facebook serves the login form again when authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2054
2055     def _real_extract(self, url):
2056         mobj = re.match(self._VALID_URL, url)
2057         if mobj is None:
2058             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2059             return
2060         video_id = mobj.group('ID')
2061
2062         # Get video webpage
2063         self.report_video_webpage_download(video_id)
2064         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2065         try:
2066             page = compat_urllib_request.urlopen(request)
2067             video_webpage = page.read()
2068         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2069             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2070             return
2071
2072         # Start extracting information
2073         self.report_information_extraction(video_id)
2074
2075         # Extract information
2076         video_info = self._parse_page(video_webpage)
2077
2078         # uploader
2079         if 'owner' not in video_info:
2080             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2081             return
2082         video_uploader = video_info['owner']
2083
2084         # title
2085         if 'title' not in video_info:
2086             self._downloader.trouble(u'ERROR: unable to extract video title')
2087             return
2088         video_title = video_info['title']
2089         video_title = video_title.decode('utf-8')
2090
2091         # thumbnail image
2092         if 'thumbnail' not in video_info:
2093             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2094             video_thumbnail = ''
2095         else:
2096             video_thumbnail = video_info['thumbnail']
2097
2098         # upload date
2099         upload_date = None
2100         if 'upload_date' in video_info:
2101             upload_time = video_info['upload_date']
2102             timetuple = email.utils.parsedate_tz(upload_time)
2103             if timetuple is not None:
2104                 try:
2105                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2106                 except:
2107                     pass
2108
2109         # description
2110         video_description = video_info.get('description', 'No description available.')
2111
2112         url_map = video_info['video_urls']
2113         if len(url_map.keys()) > 0:
2114             # Decide which formats to download
2115             req_format = self._downloader.params.get('format', None)
2116             format_limit = self._downloader.params.get('format_limit', None)
2117
2118             if format_limit is not None and format_limit in self._available_formats:
2119                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2120             else:
2121                 format_list = self._available_formats
2122             existing_formats = [x for x in format_list if x in url_map]
2123             if len(existing_formats) == 0:
2124                 self._downloader.trouble(u'ERROR: no known formats available for video')
2125                 return
2126             if req_format is None:
2127                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2128             elif req_format == 'worst':
2129                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2130             elif req_format == '-1':
2131                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2132             else:
2133                 # Specific format
2134                 if req_format not in url_map:
2135                     self._downloader.trouble(u'ERROR: requested format not available')
2136                     return
2137                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2138
2139         results = []
2140         for format_param, video_real_url in video_url_list:
2141             # Extension
2142             video_extension = self._video_extensions.get(format_param, 'mp4')
2143
2144             results.append({
2145                 'id':       video_id.decode('utf-8'),
2146                 'url':      video_real_url.decode('utf-8'),
2147                 'uploader': video_uploader.decode('utf-8'),
2148                 'upload_date':  upload_date,
2149                 'title':    video_title,
2150                 'ext':      video_extension.decode('utf-8'),
2151                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2152                 'thumbnail':    video_thumbnail.decode('utf-8'),
2153                 'description':  video_description.decode('utf-8'),
2154             })
2155         return results
2156
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Queries blip.tv's JSON API (skin=json) for metadata. If the URL turns
    out to point directly at a media file (Content-Type video/*), the open
    response handle is passed along via 'urlhandle' for direct download.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that a direct media download was detected."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the right query separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # BUGFIX: os.path.splitext() returns text (str) on
                # Python 3, which has no .decode(); calling it raised
                # AttributeError. Decode only when we got bytes (Py2 str).
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The API may wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H' (24-hour) combined with '%p' (AM/PM)
                # looks inconsistent -- confirm against blip.tv's actual
                # datestamp format before changing.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): presumably blip.tv serves some media only to the
        # iTunes user agent; this mutates the shared header table globally.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2246
2247
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv media URL and title from a myvideo.de page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was 'self._download.trouble' -- no such attribute
            # exists, so invalid URLs raised AttributeError instead of
            # reporting the error through the downloader.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The media URL is derived from the thumbnail path embedded in the
        # page's image_src link; the flv lives next to the thumbs folder.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2305
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like: 
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524    
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""                        
    IE_NAME = u'comedycentral'

    # Known bitrates, ordered best-first; used for format selection below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> file extension (used by _print_formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution (used by _print_formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written multi-line.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the Flash player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available formats (bitrate, extension, dimensions)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one or more videos from an episode or clip URL.

        Resolves shortname/newest-episode forms to a concrete episode URL,
        locates the mtvnservices media URI in the page, downloads the MRSS
        index, and builds one info dict per <item> found.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortname form (":tds", ":colbert", ...): rewrite to the show's
        # full-episodes page and re-match so the named groups are filled in.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Determine the episode/clip title, and whether we must follow the
        # site's redirect to find the newest episode (dlNewest).
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the most recent episode;
            # re-parse the final URL to recover the concrete episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Look for the mtvnservices media URL embedded in the page markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
        
        # Follow redirects on the player URL to get its final location.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Download the MRSS index feed that lists the episode's segments.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per video segment; each needs its own config download.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # The guid looks like a colon-separated media id; its last
            # component is the short id, the one before it names the show.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue
            
            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)
            
        return results
2516
2517
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration file."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video information from the page's meta tags and the
        player's JSON(-ish) configuration file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # ROBUSTNESS: each lookup below previously called .group() on a
        # possibly-None match, so any page-layout change crashed with an
        # AttributeError; report a clean error instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the config file location in its query.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: single quotes must be
        # converted before json.loads() will accept it.
        # NOTE(review): configJSON is bytes here on Python 3; replace() with
        # str arguments would fail -- confirm decoding on the py3 path.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # NOTE(review): the second playlist entry is assumed to hold the
        # actual video -- confirm against a live config file.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2589
2590
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked as not currently working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest, and
        construct the final fragment URL from both responses."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partially filled here; the remaining fields come from the
        # metadata XML and the manifest below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML (title, description, thumbnail, file).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Second request: Adobe HDS (f4m) manifest for the media node.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m namespace must be spelled out for ElementTree lookups.
        # Note: video_id is deliberately overwritten with the manifest id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # NOTE(review): the last two characters of video_id are dropped --
        # presumably required by the /z.../Seg1-Frag1 HDS addressing
        # scheme; confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2661
2662
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        req = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(req).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL sits url-encoded in a flv_url query parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title is taken from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail URL appears verbatim in the page source.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2732
2733
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track URL is being resolved through the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream URL is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a soundcloud.com page URL to a direct MP3 stream URL.

        Uses the public resolve API to obtain the track metadata (including
        the numeric track id), then queries the streams endpoint for the
        128kbps MP3 URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The API reports 'created_at' as a full timestamp (presumably
        # 'YYYY/MM/DD HH:MM:SS +0000' — confirm against the API); the
        # FileDownloader contract expects YYYYMMDD, so normalize when the
        # date components can be found, and fall back to the raw value.
        upload_date = info.get('created_at')
        if upload_date:
            date_mobj = re.search(r'(\d{4})\D(\d{2})\D(\d{2})', upload_date)
            if date_mobj:
                upload_date = ''.join(date_mobj.groups())

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2806
2807
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description from an InfoQ page."""
        import base64  # local import: str.decode('base64') is Python 2 only

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text: the regexps below are str patterns and would
            # raise TypeError when applied to bytes under Python 3.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL; the page embeds it base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1)).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The video id is the stream file name without its extension.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2876
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the given format; when the format carries
        per-bitrate sub-entries, picks the requested (or highest) bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead mirror; try the next one.
                continue

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match groups are already str under Python 3; the previous
        # .decode('utf-8') calls raised AttributeError there.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode first: json.loads needs text on older Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # Guard against every mirror being dead; file_url.split below would
        # otherwise raise AttributeError on None.
        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find an active media URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2989
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course playlist, or site root.

        Course and root pages recurse through self.extract() on each linked
        reference and concatenate the results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode to text: the str regexps below raise TypeError on
                # bytes under Python 3.
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Decode for the same reason as the course page above.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3106
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text so the str regexps below work under Python 3.
            # iso-8859-1 matches the charset the old per-group .decode()
            # calls assumed, and it decodes any byte without error.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3196
3197
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos as numbered segments; the segment file ids are
    obfuscated with a seed-driven shuffle that this extractor reverses.
    Returns one info dict per segment.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Build a pseudo-unique session id: millisecond timestamp followed
        # by two random components, mimicking the site player's behavior.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Reproduce the player's seeded shuffle of the id alphabet.
        # The constants (211, 30031, 65536) form a linear-congruential
        # generator; each step picks and removes one character from
        # `source`, so `mixed` ends up as a seed-determined permutation.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the shuffled
        # alphabet; map each index back to its character to recover the
        # real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns a JSON document with title, seed, the
        # obfuscated stream file ids per format, and per-segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's requested format onto Youku's format names:
            # 'best' prefers hd2 when available, 'worst' takes mp4, and
            # anything else falls back to flv.
            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the de-obfuscated fileid encode the segment
        # number: keep fileid[0:8], substitute the zero-padded hex segment
        # index for fileid[8:10], and keep the rest unchanged.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3312
3313
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download for this video id."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information extraction has begun."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it to text
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            page = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Direct flv URL (percent-encoded in the page source)
        url_match = re.search(self.VIDEO_URL_RE, page)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title, taken from the <title> element
        title_match = re.search(self.VIDEO_TITLE_RE, page)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail image
        thumb_match = re.search(self.VIDEO_THUMB_RE, page)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3376
3377
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        # NOTE: arguments are already str under Python 3; the previous
        # .decode('utf-8') calls in these report helpers raised AttributeError.
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            # Decode to text so the str regexps below work under Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional; stays None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Previously fell through and crashed on mobj.group(1) below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Previously fell through and crashed on the empty list below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex; unicode() is Python 2 only, so
        # round-trip through bytes with the unicode_escape codec instead.
        video_url = video_url.encode('utf-8').decode('unicode_escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3498
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages.

    Derives the final MP4 URL from Turner's CDN using the path portion of
    the page URL, and scrapes title/date/description out of the page HTML.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path portion of the URL (group 1) doubles as the video id.
        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The media file lives on Turner's CDN at a predictable path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (HTML-unescaped) capture group of rexp in the page,
            # or `default` when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: this key was misspelled 'uploader_date'; the field
            # consumed downstream (see the InfoExtractor docstring at the
            # top of the file) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3544
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the JSON API.

        Returns a tuple (number of items in the response, list of info
        dicts for the items that actually carry a video file URL).  On a
        download error an empty page (0, []) is reported after logging.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: this used to `return` None, which crashed the caller
            # with a TypeError while unpacking `page_count, page_info`.
            # Report an empty page instead so extraction stops cleanly.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; drop dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: page through the whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            # Single-clip URL: one request is enough.
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed/empty) page means we reached the end.
            if not paged or page_count != limit:
                break
            offset += limit
        return info