Convert all tabs to 4 spaces (PEP8)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import netrc
6 import os
7 import re
8 import socket
9 import time
10 import email.utils
11 import xml.etree.ElementTree
12 import random
13 import math
14
15 from utils import *
16
17
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces metadata about the
    video (or videos) it refers to: the real media URL, the title, the
    uploader and so on. Each result is a dictionary which is then handed
    to the FileDownloader, which processes it (possibly downloading the
    media to the file system, among other outcomes).

    Required dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and redefine the
    _real_initialize() and _real_extract() methods; the latter must
    return a *list* of information dictionaries as described above.
    New subclasses should probably also be added to the list of
    extractors. Broken IEs should set the _WORKING attribute to False
    so users are warned and their tests are skipped.
    """

    # _ready tracks whether _real_initialize() has already run
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor; optionally attach a downloader."""
        self.set_downloader(downloader)
        self._ready = False

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Return whether this extractor is known to work (_WORKING)."""
        return self._WORKING

    def initialize(self):
        """Run _real_initialize() exactly once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the extracted info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports through."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
100
101
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/file extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions as displayed by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class
        # implementation (which matches without flags) cannot be reused.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without an explicit duration get 4 seconds
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print each available format with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return one info dictionary per selected format for the video at url."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL (group 2 of _VALID_URL is the ID)
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces before trying the known formats
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break  # parsed successfully; remaining formats would only fail
                except ValueError:  # was a bare except, which also hid real errors
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    # decode so the regexp below runs on text, not bytes (Python 3)
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not indexable on Python 3; materialize first
                    srt_lang = list(srt_lang_list.keys())[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # exception objects are not indexable on Python 3; use .args
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # Skip malformed entries; 'sig' is required too, since a missing
            # one used to raise KeyError when building url_map below
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param if format_param else video_extension,
                                            self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
505
506
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and turn the filter off."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe.com page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Metacafe mirrors of YouTube videos are delegated to the YouTube IE
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return
        # Decode once here so the regexps below run on text, like YoutubeIE
        # does; the former .decode('utf-8') calls on individual match groups
        # were Python 2 leftovers that break on Python 3, where re groups of
        # a str subject are already str.
        webpage = webpage_bytes.decode('utf-8', 'ignore')

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, which carries a JSON-ish mediaURL/key pair
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
632
633
634 class DailymotionIE(InfoExtractor):
635     """Information Extractor for Dailymotion"""
636
637     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
638     IE_NAME = u'dailymotion'
639
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
642
643     def report_download_webpage(self, video_id):
644         """Report webpage download."""
645         self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
646
647     def report_extraction(self, video_id):
648         """Report information extraction."""
649         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
650
651     def _real_extract(self, url):
652         # Extract id and simplified title from URL
653         mobj = re.match(self._VALID_URL, url)
654         if mobj is None:
655             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
656             return
657
658         video_id = mobj.group(1).split('_')[0].split('?')[0]
659
660         video_extension = 'mp4'
661
662         # Retrieve video webpage to extract further information
663         request = compat_urllib_request.Request(url)
664         request.add_header('Cookie', 'family_filter=off')
665         try:
666             self.report_download_webpage(video_id)
667             webpage = compat_urllib_request.urlopen(request).read()
668         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
669             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
670             return
671
672         # Extract URL, uploader and title from webpage
673         self.report_extraction(video_id)
674         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
675         if mobj is None:
676             self._downloader.trouble(u'ERROR: unable to extract media URL')
677             return
678         flashvars = compat_urllib_parse.unquote(mobj.group(1))
679
680         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
681             if key in flashvars:
682                 max_quality = key
683                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
684                 break
685         else:
686             self._downloader.trouble(u'ERROR: unable to extract video URL')
687             return
688
689         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
690         if mobj is None:
691             self._downloader.trouble(u'ERROR: unable to extract video URL')
692             return
693
694         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
695
696         # TODO: support choosing qualities
697
698         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
699         if mobj is None:
700             self._downloader.trouble(u'ERROR: unable to extract title')
701             return
702         video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
703
704         video_uploader = None
705         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
706         if mobj is None:
707             # lookin for official user
708             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
709             if mobj_official is None:
710                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
711             else:
712                 video_uploader = mobj_official.group(1)
713         else:
714             video_uploader = mobj.group(1)
715
716         video_upload_date = None
717         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
718         if mobj is not None:
719             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
720
721         return [{
722             'id':       video_id.decode('utf-8'),
723             'url':      video_url.decode('utf-8'),
724             'uploader': video_uploader.decode('utf-8'),
725             'upload_date':  video_upload_date,
726             'title':    video_title,
727             'ext':      video_extension.decode('utf-8'),
728         }]
729
730
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and title from a Google Video page.

        Returns a one-element list with the info dictionary, or None
        (after calling self._downloader.trouble) on failure.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # hl=en / oe=utf-8 pin the language and encoding the regexps expect.
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the direct MP4 download link; fall back to the FLV stream.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Undo the JavaScript hex escapes left in the URL:
        # '\\x3d' -> '=' and '\\x26' -> '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        # NOTE(review): the description is extracted (and extraction failure
        # is fatal) but it is not included in the returned dict below.
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on a search-results page, so run a
            # site-restricted search for this video id.
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
822
823
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Pull the FLV media URL, title and uploader out of a Photobucket page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Fetch the page that embeds the flash player.
        try:
            self.report_download_webpage(video_id)
            page = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL lives in the rel="video_src" <link> element.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', page)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the page <title>.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', page)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
887
888
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Non-'/watch/' URLs are first resolved to their canonical
        '/watch/' form via a single recursive call (new_video=False).
        Returns a one-element list with the info dictionary, or None
        (after calling self._downloader.trouble) on failure.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) only captures the 'people'/'profile' path
        # component; the uploader's display name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            # BUGFIX: video_thumbnail was already decoded above; decoding a
            # second time raised UnicodeEncodeError for non-ASCII URLs.
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1029
1030
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # BUGFIX: the dot after (www|player) is escaped; previously the bare '.'
    # matched any character (e.g. 'wwwXvimeo.com').
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo page.

        Parses the embedded player config JSON and picks the best
        available quality/codec combination. Returns a one-element list
        with the info dictionary, or None (after reporting trouble).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON.
        # BUGFIX: the split is inside the try so a page without the config
        # marker reports trouble instead of raising IndexError; the bare
        # 'except:' (which also swallowed KeyboardInterrupt/SystemExit) is
        # narrowed to 'except Exception'.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1142
1143
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body.

        Returns None (after reporting trouble) when the download fails.
        """
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and collect regex groups into a dict.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under *key*. Returns the dict, or None when
        the regex or any group fails to match.

        NOTE(review): fetch_webpage may return None on download failure,
        in which case re.search raises TypeError here — confirm callers
        can tolerate that, or add a guard.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the RTMP stream.

        NOTE(review): video_url is computed but the method returns None,
        so live streams yield no info dict (see _real_extract).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page down to its HD MP4 URL via two XML hops.

        Returns an info dictionary suitable for _real_extract.
        """
        video_lang = url.split('/')[-3]
        # Hop 1: the flash player's videorefFileUrl points at a language index.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Hop 2: pick the <video> ref matching the page's language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the video description XML with id, title, date and
        # the HD-quality URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns None, so live URLs
            # currently produce no result.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1279
1280
1281 class GenericIE(InfoExtractor):
1282     """Generic last-resort information extractor."""
1283
1284     _VALID_URL = r'.*'
1285     IE_NAME = u'generic'
1286
1287     def __init__(self, downloader=None):
1288         InfoExtractor.__init__(self, downloader)
1289
1290     def report_download_webpage(self, video_id):
1291         """Report webpage download."""
1292         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1293         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1294
1295     def report_extraction(self, video_id):
1296         """Report information extraction."""
1297         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1298
1299     def report_following_redirect(self, new_url):
1300         """Report information extraction."""
1301         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1302         
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request and compares the final URL with the
        original. If they differ, hands the new URL back to the
        downloader (restarting extractor selection) and returns True;
        otherwise returns False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request that always uses the HEAD method, so no body is
            # transferred while probing for redirects.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our 
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl): 
                if code in (301, 302, 303, 307):
                    # Spaces in Location headers must be percent-encoded.
                    newurl = newurl.replace(' ', '%20') 
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl, 
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(), 
                                       unverifiable=True) 
                else: 
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) 

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers): 
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-open with a plain (GET) Request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(), 
                                                 headers=newheaders, 
                                                 origin_req_host=req.get_origin_req_host(), 
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector() 
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: let the generic extractor continue.
        if url == new_url:
            return False

        # Redirect found: restart the whole extraction chain on the target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1357
    def _real_extract(self, url):
        """Last-resort extraction: scrape the page for a direct media URL.

        First tries the redirect check; then looks for a JW Player /
        SWFObject 'file=' flashvar, falling back to any 'file='/'source='
        HTTP URL found in the page. Returns a single-entry info list or
        None after reporting trouble.
        """
        if self._test_redirect(url): return

        # Provisional id: last path component; replaced below once the
        # real media URL is known.
        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # split the basename into (stem, '.ext') and keep ext without the dot
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE: the .decode('utf-8') calls below assume py2 byte strings;
        # under py3 these would fail — presumably this file targets py2 here.
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # NOTE(review): message says 'title' but this branch fails on the
            # uploader/domain match — looks like a copy-paste; confirm before changing.
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
1425
1426
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:terms)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, terms = query.split(':')
        prefix = prefix[8:]  # whatever follows the literal 'ytsearch'
        terms = terms.encode('utf-8')
        if prefix == '':
            self._download_n_results(terms, 1)
        elif prefix == 'all':
            self._download_n_results(terms, self._max_youtube_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, terms))
                    return
                if count > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, count))
                    count = self._max_youtube_results
                self._download_n_results(terms, count)
            except ValueError: # prefix was not an integer
                self._download_n_results(terms, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        page_idx = 0
        limit = n

        # Each API page returns up to 50 ids; the response also reports the
        # true total, so the limit is tightened as pages come in.
        while 50 * page_idx < limit:
            self.report_download_page(query, page_idx + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page_idx + 1)
            try:
                data = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            collected.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_idx += 1

        for video_id in collected[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1501
1502
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:terms)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, terms = query.split(':')
        prefix = prefix[8:]  # whatever follows the literal 'gvsearch'
        terms = terms.encode('utf-8')
        if prefix == '':
            self._download_n_results(terms, 1)
        elif prefix == 'all':
            self._download_n_results(terms, self._max_google_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, terms))
                    return
                if count > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                    count = self._max_google_results
                self._download_n_results(terms, count)
            except ValueError: # prefix was not an integer
                self._download_n_results(terms, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        page_idx = 0

        while True:
            self.report_download_page(query, page_idx)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_idx * 10)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids in order of first appearance; stop as soon as we
            # have the requested number.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in collected:
                    continue
                collected.append(vid)
                if len(collected) == n:
                    for vid in collected:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next page" link: download whatever was found.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            page_idx = page_idx + 1
1583
1584
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:terms)."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        match = re.match(self._VALID_URL, query)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, terms = query.split(':')
        prefix = prefix[8:]  # whatever follows the literal 'yvsearch'
        terms = terms.encode('utf-8')
        if prefix == '':
            self._download_n_results(terms, 1)
        elif prefix == 'all':
            self._download_n_results(terms, self._max_yahoo_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (count, terms))
                    return
                if count > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                    count = self._max_yahoo_results
                self._download_n_results(terms, count)
            except ValueError: # prefix was not an integer
                self._download_n_results(terms, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        page_idx = 1

        while True:
            self.report_download_page(query, page_idx)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_idx)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids in order of first appearance; stop as soon as we
            # have the requested number.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in seen:
                    continue
                collected.append(vid)
                seen.add(vid)
                if len(collected) == n:
                    for vid in collected:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No "Next" link: download whatever was found.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            page_idx = page_idx + 1
1667
1668
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing video id means the URL points at a single video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # 'p' is the default prefix for playlists; artist lists ('a') need
        # a different access point.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        page_idx = 1
        while True:
            self.report_download_page(playlist_id, page_idx)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, page_idx)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            page_idx += 1

        # Honour --playlist-start / --playlist-end.
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1740
1741
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        page_idx = 1

        # Walk the channel's paginated listing until the "Next" link disappears.
        while True:
            self.report_download_page(channel_id, page_idx)
            page_url = self._TEMPLATE_URL % (channel_id, page_idx)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(page_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            page_idx += 1

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1790
1791
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE ids, so keep
        # requesting consecutive pages until one comes back short.
        video_ids = []
        page_idx = 0

        while True:
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # A page shorter than _GDATA_PAGE_SIZE ids must be the last one;
            # no need for a further query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_idx += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1873
1874
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Enumerate a blip.tv user's videos via the mobile Ajax episode
        list and queue each one for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Fix: the user-id marker may be absent; previously mobj.group(1)
        # raised an uncaught AttributeError in that case.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str (not str) for consistency with the
                # rest of the file's error reporting.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # NOTE(review): membership is tested on the raw match but the
                # unescaped value is appended — dedup may miss escaped dupes;
                # preserved as-is to avoid changing output.
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1965
1966
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Fetch the file page with the 'Free download' form submitted and
        extract the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fix: raw string so '\s' is a regex class, not a (py3-invalid)
                # string escape; behavior unchanged.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2029
2030
2031 class FacebookIE(InfoExtractor):
2032     """Information Extractor for Facebook"""
2033
2034     _WORKING = False
2035     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2036     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2037     _NETRC_MACHINE = 'facebook'
2038     _available_formats = ['video', 'highqual', 'lowqual']
2039     _video_extensions = {
2040         'video': 'mp4',
2041         'highqual': 'mp4',
2042         'lowqual': 'mp4',
2043     }
2044     IE_NAME = u'facebook'
2045
    def __init__(self, downloader=None):
        # Delegate straight to the shared InfoExtractor constructor.
        InfoExtractor.__init__(self, downloader)
2048
    def _reporter(self, message):
        """Prefix *message* with the [facebook] header and print it."""
        self._downloader.to_screen(u'[facebook] %s' % message)
2052
    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')
2056
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)
2060
2061     def report_information_extraction(self, video_id):
2062         """Report attempt to extract video information."""
2063         self._reporter(u'%s: Extracting video information' % video_id)
2064
2065     def _parse_page(self, video_webpage):
2066         """Extract video information from page"""
2067         # General data
2068         data = {'title': r'\("video_title", "(.*?)"\)',
2069             'description': r'<div class="datawrap">(.*?)</div>',
2070             'owner': r'\("video_owner_name", "(.*?)"\)',
2071             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2072             }
2073         video_info = {}
2074         for piece in data.keys():
2075             mobj = re.search(data[piece], video_webpage)
2076             if mobj is not None:
2077                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2078
2079         # Video urls
2080         video_urls = {}
2081         for fmt in self._available_formats:
2082             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2083             if mobj is not None:
2084                 # URL is in a Javascript segment inside an escaped Unicode format within
2085                 # the generally utf-8 page
2086                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2087         video_info['video_urls'] = video_urls
2088
2089         return video_info
2090
2091     def _real_initialize(self):
2092         if self._downloader is None:
2093             return
2094
2095         useremail = None
2096         password = None
2097         downloader_params = self._downloader.params
2098
2099         # Attempt to use provided username and password or .netrc data
2100         if downloader_params.get('username', None) is not None:
2101             useremail = downloader_params['username']
2102             password = downloader_params['password']
2103         elif downloader_params.get('usenetrc', False):
2104             try:
2105                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2106                 if info is not None:
2107                     useremail = info[0]
2108                     password = info[2]
2109                 else:
2110                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2111             except (IOError, netrc.NetrcParseError) as err:
2112                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2113                 return
2114
2115         if useremail is None:
2116             return
2117
2118         # Log in
2119         login_form = {
2120             'email': useremail,
2121             'pass': password,
2122             'login': 'Log+In'
2123             }
2124         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2125         try:
2126             self.report_login()
2127             login_results = compat_urllib_request.urlopen(request).read()
2128             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2129                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2130                 return
2131         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2132             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2133             return
2134
2135     def _real_extract(self, url):
2136         mobj = re.match(self._VALID_URL, url)
2137         if mobj is None:
2138             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2139             return
2140         video_id = mobj.group('ID')
2141
2142         # Get video webpage
2143         self.report_video_webpage_download(video_id)
2144         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2145         try:
2146             page = compat_urllib_request.urlopen(request)
2147             video_webpage = page.read()
2148         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2149             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2150             return
2151
2152         # Start extracting information
2153         self.report_information_extraction(video_id)
2154
2155         # Extract information
2156         video_info = self._parse_page(video_webpage)
2157
2158         # uploader
2159         if 'owner' not in video_info:
2160             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2161             return
2162         video_uploader = video_info['owner']
2163
2164         # title
2165         if 'title' not in video_info:
2166             self._downloader.trouble(u'ERROR: unable to extract video title')
2167             return
2168         video_title = video_info['title']
2169         video_title = video_title.decode('utf-8')
2170
2171         # thumbnail image
2172         if 'thumbnail' not in video_info:
2173             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2174             video_thumbnail = ''
2175         else:
2176             video_thumbnail = video_info['thumbnail']
2177
2178         # upload date
2179         upload_date = None
2180         if 'upload_date' in video_info:
2181             upload_time = video_info['upload_date']
2182             timetuple = email.utils.parsedate_tz(upload_time)
2183             if timetuple is not None:
2184                 try:
2185                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2186                 except:
2187                     pass
2188
2189         # description
2190         video_description = video_info.get('description', 'No description available.')
2191
2192         url_map = video_info['video_urls']
2193         if len(url_map.keys()) > 0:
2194             # Decide which formats to download
2195             req_format = self._downloader.params.get('format', None)
2196             format_limit = self._downloader.params.get('format_limit', None)
2197
2198             if format_limit is not None and format_limit in self._available_formats:
2199                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2200             else:
2201                 format_list = self._available_formats
2202             existing_formats = [x for x in format_list if x in url_map]
2203             if len(existing_formats) == 0:
2204                 self._downloader.trouble(u'ERROR: no known formats available for video')
2205                 return
2206             if req_format is None:
2207                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2208             elif req_format == 'worst':
2209                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2210             elif req_format == '-1':
2211                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2212             else:
2213                 # Specific format
2214                 if req_format not in url_map:
2215                     self._downloader.trouble(u'ERROR: requested format not available')
2216                     return
2217                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2218
2219         results = []
2220         for format_param, video_real_url in video_url_list:
2221             # Extension
2222             video_extension = self._video_extensions.get(format_param, 'mp4')
2223
2224             results.append({
2225                 'id':       video_id.decode('utf-8'),
2226                 'url':      video_real_url.decode('utf-8'),
2227                 'uploader': video_uploader.decode('utf-8'),
2228                 'upload_date':  upload_date,
2229                 'title':    video_title,
2230                 'ext':      video_extension.decode('utf-8'),
2231                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2232                 'thumbnail':    video_thumbnail.decode('utf-8'),
2233                 'description':  video_description.decode('utf-8'),
2234             })
2235         return results
2236
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv metadata for *url* and return [info dict].

        Requests the JSON API view of the page; if the server instead
        answers with a video/* Content-Type, the URL is treated as a
        direct media download and the open handle is reused.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON API parameters with the right query separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # Derive id/title/ext from the URL's last path component.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' is '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): presumably blip.tv serves the plain media file to
        # this iTunes user agent — confirm before changing.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2325
2326
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.trouble, which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link carries the media server base path; the flv
        # lives next to the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2384
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts either shortcut forms (":tds", ":colbert", ...) or full
    # full-episodes URLs on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    # Known bitrates, worst-first; the last entry is picked by default.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> resolution, for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of a media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report the resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print a human-readable table of the given format ids."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve an episode URL to a list of per-media-item info dicts.

        Flow: resolve shortcuts/redirects to a concrete episode page,
        find the Flash movie parameters, download the MRSS index of the
        episode's media items, then download each item's configuration
        XML to pick a video URL by bitrate.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Map shortcut forms to the show's full-episodes landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No specific episode in the URL means "download the newest one";
        # the server redirect (followed below) supplies the episode.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Follow the player URL's redirects to its final location.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # The MRSS feed lists every media item (act) of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like '...:<show>.com:...:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2572
2573
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce the start of metadata extraction for a show."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video metadata from an Escapist episode page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = match.group('showname')
        videoId = match.group('episode')

        self.report_extraction(showName)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honor the charset declared in the Content-Type header, if any.
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in <meta> tags; the config URL is embedded in the
        # og:video player URL's query string.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            configText = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON: coerce single quotes.
        configText = configText.replace("'", '"')

        try:
            config = json.loads(configText)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2645
2646
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor watch URL via its moogaloop metadata XML."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        self.report_webpage(video_id)
        try:
            webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # The page references an internal numeric id that keys the XML feed.
        id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        if id_match is None:
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
            return
        internal_id = id_match.group('internalvideoid')

        info = {
            'id': video_id,
            'internal_id': internal_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        metadata_url = 'http://www.collegehumor.com/moogaloop/video:' + internal_id
        try:
            metadata_xml = compat_urllib_request.urlopen(metadata_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        doc = xml.etree.ElementTree.fromstring(metadata_xml)
        try:
            video_node = doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['url'] = video_node.findall('./file')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            # Derive the extension from the media URL's suffix.
            info['ext'] = info['url'].rpartition('.')[2]
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        return [info]
2710
2711
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the media URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is URL-encoded in the flv_url query parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1).decode('utf-8'))

        # Title: everything before the site-name suffix in <title>.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1).decode('utf-8')

        # Thumbnail: the whole matched image URL is used.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2780
2781
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        if mobj is None:
            # Fail explicitly here: previously a miss left video_id and
            # stream_token unbound and triggered a NameError further down.
            self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
            return
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        if mobj:
            title = mobj.group(1).decode('utf-8')
        else:
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # description
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        if mobj:
            description = mobj.group(1)

        # upload date
        upload_date = None
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        if mobj:
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
            except Exception as err:
                self._downloader.to_stderr(compat_str(err))

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): this request is constructed but never sent — looks
        # like dead code; confirm against the download path before removing.
        request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      mediaURL,
            'uploader': uploader.decode('utf-8'),
            'upload_date':  upload_date,
            'title':    title,
            'ext':      u'mp3',
            'description': description.decode('utf-8')
        }]
2872
2873
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds it base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        # Split only on the last dot so filenames containing extra dots
        # (e.g. 'foo.bar.mp4') no longer raise ValueError.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2942
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        json_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(json_url)
        try:
            self.report_download_json(json_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialise both selection results so an empty/unsuccessful search
        # fails cleanly below instead of raising NameError/AttributeError.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working download URL')
            return

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3055
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The path group distinguishes three page kinds: a VideoPage with
    # course+video params, a CoursePage with only a course param, and the
    # site root / HomePage.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL kind: a single video yields one info dict; a
        course page or the root page recurses through self.extract() on
        each referenced page and concatenates the results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata XML lives alongside the course's videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a 'reference'
            # entry, resolved recursively through self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every CoursePage found on the home page; each of
            # those in turn recurses into its videos (branch above).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3172
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed message: was missing the word "extract".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard: renditions[-1] below would raise IndexError on an
            # empty list.
            self._downloader.trouble(u'ERROR: unable to extract any video renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3262
3263
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Generate a pseudo-random session id from the current time."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seeded character permutation used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Simple linear-congruential step driving the shuffle.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; narrowed to Exception.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3384
3385
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            page = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Pull the three metadata fields out of the page, aborting with a
        # field-specific error message on the first miss.
        fields = {}
        for name, pattern, errmsg in (
                ('url', self.VIDEO_URL_RE, u'ERROR: unable to extract video url'),
                ('title', self.VIDEO_TITLE_RE, u'ERROR: unable to extract video title'),
                ('thumbnail', self.VIDEO_THUMB_RE, u'ERROR: unable to extract video thumbnail')):
            found = re.search(pattern, page)
            if found is None:
                self._downloader.trouble(errmsg)
                return
            fields[name] = found.group(1).decode('utf-8')

        # Only the media URL needs percent-decoding.
        video_url = compat_urllib_parse.unquote(fields['url'])

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': fields['title'],
            'ext': 'flv',
            'thumbnail': fields['thumbnail'],
            'description': None,
        }]
3447
3448
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # Group 1: numeric profile id, group 2: post id.
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report entry upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

    def _real_extract(self, url):
        """Extract video information from a plus.google.com post URL.

        Returns a one-element list with the info dictionary for the
        highest-resolution video link found on the post's video page,
        or None (implicitly) after reporting trouble.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional — stays None if not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional — stays None if not found)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out: falling through would dereference None below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out: falling through would index an empty list below.
            return

        # Sort (resolution, url) tuples ascending by resolution
        links = sorted(mobj)

        # Choose the last of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex (Python 2 unicode_escape decoding)
        video_url = unicode(video_url, "unicode_escape")


        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': uploader.decode('utf-8'),
            'upload_date':  upload_date.decode('utf-8'),
            'title':    video_title.decode('utf-8'),
            'ext':      video_extension.decode('utf-8'),
        }]