Merge remote-tracking branch 'gcmalloc/master' into fork_master
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; instances overwrite _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" dimensions (for --list-formats output)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class matcher
        # cannot be reused as-is.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print a human-readable table of the available itags."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the reply means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download and parse the video page; return a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Stop at the first format that parses; retrying would
                    # attempt to re-parse the already-normalized date.
                    break
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    # Decode immediately: re.findall below needs text, not bytes.
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict views are not subscriptable in Python 3
                    srt_lang = list(srt_lang_list.keys())[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Exceptions are not indexable in Python 3; use .args instead.
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Append the signature only when the entry carries one; unconditional
            # ud['sig'][0] access crashed with KeyError on signature-less streams.
            url_map = {}
            for ud in url_data:
                real_url = ud['url'][0]
                if 'sig' in ud:
                    real_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = real_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download and parse a Metacafe watch page; return a list of info dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode once here: urlopen().read() returns bytes on Python 3,
            # and the str regexes below would otherwise raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage is already text, so no further .decode() calls are needed
        # (str has no decode() method on Python 3).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract url, title, uploader and upload date for a Dailymotion
        video page; returns a one-element list of info dicts, or None after
        reporting trouble."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id may carry a trailing "_title" slug or a query string; strip both.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are served too.
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, from highest to lowest.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        # The uploader is optional: a missing nickname only issues a WARNING,
        # so video_uploader may legitimately still be None afterwards.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; convert to the YYYYMMDD format expected downstream.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            # BUGFIX: video_uploader is None when only a WARNING was issued
            # above; unconditionally calling .decode() crashed the extraction
            # with AttributeError even though everything else succeeded.
            'uploader': video_uploader.decode('utf-8') if video_uploader is not None else None,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
731
732
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract url, title, description and (optional) thumbnail for a
        Google Video page; returns a one-element list of info dicts, or None
        after reporting trouble."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No MP4 download link; fall back to the escaped FLV stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Undo the JavaScript hex-escaping of '=' and '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page, so run a
            # site search for this video id.
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            # FIX: these were mandatorily extracted above (failure aborts the
            # extraction) yet silently dropped from the result; expose them as
            # the optional fields documented on InfoExtractor, like YahooIE.
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
824
825
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Photobucket
        video page; returns a one-element list of info dicts."""
        # The video id is the .flv filename captured from the URL itself.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = match.group(1)

        video_extension = 'flv'

        # Download the page so we can scrape the remaining fields.
        page_request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL is advertised via a <link rel="video_src"> element.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both live in the <title> element.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
889
890
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video page.

        Non-'/watch/' URLs are first rewritten to their canonical /watch/
        form and re-extracted (new_video=False marks the second pass).
        Returns a one-element list of info dicts, or None after trouble.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) captures the literal 'people'/'profile' path
        # segment of the regex above; the uploader's name is in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            # BUGFIX: video_thumbnail was already decoded above; decoding a
            # second time fails for non-ASCII thumbnails on Python 2.
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1031
1032
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from the config JSON embedded in a
        Vimeo page; returns a one-element list of info dicts."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by slicing between its page markers.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError, TypeError):
            # IndexError: marker not found; ValueError: malformed JSON;
            # TypeError: bytes/str mismatch. (Previously a bare except, which
            # also swallowed KeyboardInterrupt/SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd over sd over whatever else is advertised.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1144
1145
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # URLs ending in index-<n>.html are treated as live streams.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Returns None (after reporting trouble) when the download fails;
        callers such as grep_webpage assume a page and would then fail on
        None — NOTE(review): confirm whether that fallthrough is intended.
        """
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed/unknown URL types.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect the requested groups.

        matchTuples is a list of (group_index, key, error_message) triples;
        each matched group is stored in the returned dict under *key*.
        Returns None (after reporting trouble) if the page does not match
        or a required group is absent.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp player/path info for a live-stream URL.

        NOTE(review): the final video_url is computed but never returned,
        and _real_extract discards this call's result, so live streams
        currently produce no download — confirm whether this is intended.
        """
        # Language code is the 4th path component from the end.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 redirect chain and return an info dict.

        Chases videorefFileUrl -> language-specific <video> ref -> final
        XML that carries id, title, date and the HD stream URL.
        """
        # Language code is the 3rd path component from the end.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # The last path component doubles as the video id for reporting.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: see NOTE(review) on extractLiveStream — the
            # result is discarded and nothing is returned here.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1281
1282
1283 class GenericIE(InfoExtractor):
1284     """Generic last-resort information extractor."""
1285
1286     _VALID_URL = r'.*'
1287     IE_NAME = u'generic'
1288
    def __init__(self, downloader=None):
        """Delegate construction to the InfoExtractor base class."""
        InfoExtractor.__init__(self, downloader)
1291
1292     def report_download_webpage(self, video_id):
1293         """Report webpage download."""
1294         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1295         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1296
1297     def report_extraction(self, video_id):
1298         """Report information extraction."""
1299         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1300
1301     def report_following_redirect(self, new_url):
1302         """Report information extraction."""
1303         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1304         
    def _test_redirect(self, url):
        """Check whether *url* redirects (e.g. a URL shortener); if so,
        restart the whole download chain on the final URL and return True.
        Returns False when no redirect occurred."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only the headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL.
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl): 
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20') 
                    # Drop body-describing headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl, 
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(), 
                                       unverifiable=True) 
                else: 
                    # Non-redirect codes are handed back as HTTP errors.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) 

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fall back to GET if HEAD is not allowed (405 HTTP error).
            """
            def http_error_405(self, req, fp, code, msg, headers): 
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(), 
                                                 headers=newheaders, 
                                                 origin_req_host=req.get_origin_req_host(), 
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector() 
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after the round trip: no redirect, continue normal extraction.
        if url == new_url:
            return False

        # Redirected: restart the download chain on the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1359
1360     def _real_extract(self, url):
1361         if self._test_redirect(url): return
1362
1363         video_id = url.split('/')[-1]
1364         request = compat_urllib_request.Request(url)
1365         try:
1366             self.report_download_webpage(video_id)
1367             webpage = compat_urllib_request.urlopen(request).read()
1368         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1369             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1370             return
1371         except ValueError as err:
1372             # since this is the last-resort InfoExtractor, if
1373             # this error is thrown, it'll be thrown here
1374             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1375             return
1376
1377         self.report_extraction(video_id)
1378         # Start with something easy: JW Player in SWFObject
1379         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1380         if mobj is None:
1381             # Broaden the search a little bit
1382             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1383         if mobj is None:
1384             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1385             return
1386
1387         # It's possible that one of the regexes
1388         # matched, but returned an empty group:
1389         if mobj.group(1) is None:
1390             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1391             return
1392
1393         video_url = compat_urllib_parse.unquote(mobj.group(1))
1394         video_id = os.path.basename(video_url)
1395
1396         # here's a fun little line of code for you:
1397         video_extension = os.path.splitext(video_id)[1][1:]
1398         video_id = os.path.splitext(video_id)[0]
1399
1400         # it's tempting to parse this further, but you would
1401         # have to take into account all the variations like
1402         #   Video Title - Site Name
1403         #   Site Name | Video Title
1404         #   Video Title - Tagline | Site Name
1405         # and so on and so forth; it's just not practical
1406         mobj = re.search(r'<title>(.*)</title>', webpage)
1407         if mobj is None:
1408             self._downloader.trouble(u'ERROR: unable to extract title')
1409             return
1410         video_title = mobj.group(1).decode('utf-8')
1411
1412         # video uploader is domain name
1413         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1414         if mobj is None:
1415             self._downloader.trouble(u'ERROR: unable to extract title')
1416             return
1417         video_uploader = mobj.group(1).decode('utf-8')
1418
1419         return [{
1420             'id':       video_id.decode('utf-8'),
1421             'url':      video_url.decode('utf-8'),
1422             'uploader': video_uploader,
1423             'upload_date':  None,
1424             'title':    video_title,
1425             'ext':      video_extension.decode('utf-8'),
1426         }]
1427
1428
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles the virtual URLs "ytsearch:QUERY" (first result),
    "ytsearchN:QUERY" (first N results) and "ytsearchall:QUERY".
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch pseudo-URL and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only; a bare split(':') raises
        # ValueError when the search terms themselves contain a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may hold fewer items than requested; shrink the
            # limit so the loop stops once everything has been fetched.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1503
1504
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles the virtual URLs "gvsearch:QUERY", "gvsearchN:QUERY" and
    "gvsearchall:QUERY".
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch pseudo-URL and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only; a bare split(':') raises
        # ValueError when the search terms themselves contain a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means we have seen every result.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1585
1586
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles the virtual URLs "yvsearch:QUERY", "yvsearchN:QUERY" and
    "yvsearchall:QUERY".
    """
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch pseudo-URL and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only; a bare split(':') raises
        # ValueError when the search terms themselves contain a colon.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link means we have seen every result.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1669
1670
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: playlist type marker (p/a/list), group 2: playlist id,
    # group 3: optional trailing single-video id.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Pagination marker: a "Next »" link is present while more pages exist.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a playlist and queue them for download.

        Honors the 'playliststart'/'playlistend' downloader params and
        delegates single-video playlist URLs straight to the downloader.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated per page, in order)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the "Next" pagination marker disappears.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end; -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1749
1750
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Walk every listing page of a channel and queue all its videos."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather the ids found on this page, dropping duplicates while
            # preserving first-seen order.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # Keep paging while the "Next" marker is still present.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1801
1802
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum result count per GData API query.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them.

        Honors the 'playliststart'/'playlistend' downloader params.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end; -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1885
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Maximum result count per Ajax episode-list query.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them.

        Resolves the numeric users_id from the profile page, then pages
        through the mobile Ajax episode list. Honors the
        'playliststart'/'playlistend' downloader params.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is embedded in the profile page; previously a
        # missing match raised an uncaught AttributeError.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users_id for user %s' % username)
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # use compat_str for consistency with the rest of the file
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end; -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1976
1977
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a DepositFiles page.

        Simulates pressing the 'Free download' button and parses the
        resulting page; reports the site's restriction message when the
        download is not available.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # raw string: '\s' in a plain literal is an invalid escape
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2040
2041
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; format selection below slices this list.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Scrapes title/description/owner/thumbnail plus the per-format video
        URLs out of inline JavaScript on the video page. Missing fields are
        simply absent from the returned dict.
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode assumes Python 2 byte strings; on
                # Python 3 str has no .decode — confirm before re-enabling
                # this extractor (_WORKING is False).
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Missing credentials are not an error: extraction simply proceeds
        unauthenticated. Login failures are reported as warnings only.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download the video page and return a one-entry (or per-format) result list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # parsedate_tz yields a 10-tuple; strftime takes the first 9 fields.
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # BUG FIX: video_url_list used to be left unassigned when url_map was
        # empty, so the loop below raised NameError instead of reporting a
        # proper extraction error.
        video_url_list = None
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        if video_url_list is None:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id':       video_id.decode('utf-8'),
                'url':      video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension.decode('utf-8'),
                'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description.decode('utf-8'),
            })
        return results
2247
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct media download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Requests the URL with the JSON-skin parameters appended. If the
        server answers with a video/* Content-Type the URL is treated as a
        direct download; otherwise the JSON body is parsed for metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the appropriate separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode assumes a Python 2 byte string; on
                # Python 3 str has no .decode — confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open response to the downloader so the
                    # media is not requested a second time.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '12-31-12 11:47PM' to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # Mutates the shared std_headers so the actual media download is made
        # with an iTunes User-Agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2337
2338
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this used to call self._download.trouble (no such
            # attribute exists), so invalid URLs raised AttributeError
            # instead of being reported like in every other extractor.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail <link> carries the media base URL; the flv lives next
        # to the thumbs directory under the same path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2396
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like: http://www.thedailyshow.com/full-episodes/...
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrate identifiers (used for --list-formats and -f selection).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate identifier.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display dimensions per bitrate identifier (for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses whitespace for readability, so VERBOSE mode is
        # required here (the base class matches without it).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-item configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available formats with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one result per item in the episode's MRSS index.

        Flow: resolve shortcuts/redirects to an episode URL, find the Flash
        URL in the page, resolve the player URL, download the MRSS index,
        then fetch a config XML per item to pick a rendition.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand ':tds'-style shortcuts into full-episodes URLs.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "newest episode": follow the
            # server redirect below to find out which one that is.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the newest episode; re-match
            # the final URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to obtain the final SWF player URL.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # The guid looks like '...:showname.com:mediaId'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2607
2608
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video information from an Escapist video page.

        Reads the page's <meta> tags for description, thumbnail and player
        URL, follows the player's config= parameter to a JSON-like
        configuration file, and takes the second playlist entry as the
        video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type, with a
            # UTF-8 fallback.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # BUG FIX: these matches used to be dereferenced unguarded, so a page
        # missing an expected <meta> tag crashed with AttributeError instead
        # of reporting a proper extraction error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if descMatch is None or imgMatch is None or playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract video information from: %s' % url)
            return
        description = unescapeHTML(descMatch.group(1))
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL from player URL: %s' % playerUrl)
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # BUG FIX: on Python 3 urlopen().read() returns bytes, and the
        # str-based replace below raised TypeError; normalize to text first.
        if isinstance(configJSON, bytes):
            configJSON = configJSON.decode('utf-8')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The video is the second playlist entry — presumably entry 0 is a
        # preroll; confirm against a live config file.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2680
2681
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the Adobe HDS (f4m) manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video information from a collegehumor.com video page.

        Two stages: first the moogaloop metadata XML (title, description,
        thumbnail, manifest URL), then the HDS manifest XML from which the
        fragment URL is assembled.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Elements in the f4m manifest are namespace-qualified.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL from the manifest's host, the manifest id
        # (last two characters dropped) and the media node's url attribute.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2752
2753
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the xvideos page and scrape stream URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(page_request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The Flash video URL is embedded URL-encoded in the page source.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1).decode('utf-8'))

        # The page title carries the video title before the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1).decode('utf-8')

        # Thumbnail: the whole matched URL is used (group 0), not just the
        # trailing filename group.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2822
2823
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream URL is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a SoundCloud permalink to its 128kbps MP3 stream.

        Uses the public resolve endpoint to map the page URL to a track
        id, then asks the streams endpoint for the MP3 URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader and the slug of the song title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The API reports created_at as a timestamp like
        # '2012/09/27 21:57:10 +0000', but the downloader contract wants
        # upload_date as YYYYMMDD (previously the raw string was passed on).
        upload_date = None
        date_mobj = re.search(r'(\d{4})[/-](\d{2})[/-](\d{2})', info.get('created_at') or u'')
        if date_mobj is not None:
            upload_date = u''.join(date_mobj.groups())

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2896
2897
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report that the presentation page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape an InfoQ presentation page for its RTMPE video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded media path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        # rsplit: the basename may itself contain dots; only split off the
        # final extension (plain split('.') raised ValueError on such names).
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2966
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict mapping bitrate -> url list, or a
        plain url list when no bitrate information exists (the TypeError
        path below).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a cloudcast page to the first working stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialize so an empty formats dict cannot leave these unbound
        # (previously a NameError at the return statement below).
        file_url = None
        format_param = None

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # check_urls returns None when every candidate URL is dead;
        # previously this crashed below with AttributeError on .decode().
        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working stream URL')
            return

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3079
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page, or the root.

        Course and root pages are treated as playlists: every linked
        sub-page is fed back through self.extract(), so the returned list
        is the concatenation of the recursive per-video results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall(...)[0] raises IndexError when the element is missing
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a 'reference' entry, resolved
            # recursively below through self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every CoursePage link is resolved recursively; each course in
            # turn expands into its videos via the branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3196
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape an MTV video page and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri' (missing verb)
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # previously an unguarded renditions[-1] raised IndexError here
            self._downloader.trouble(u'ERROR: no renditions found in video metadata')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # AttributeError is raised when find('./src') returns None
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3286
3287
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a session id from the ms timestamp and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-shuffled alphabet used to decode stream file ids.

        A linear-congruential generator driven by `seed` repeatedly picks
        one character out of the source alphabet, so the same seed always
        yields the same permutation.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and build one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            # range instead of xrange so this also runs under Python 3
            keys = []
            for i in range(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any parsing failure is reported the same way
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3408
3409
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of metadata extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the xnxx page and scrape stream URL, title and thumbnail."""
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = id_match.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Pull the three page fields in order; bail out with the matching
        # error message as soon as any of them is missing.
        captured = []
        for pattern, what in ((self.VIDEO_URL_RE, u'video url'),
                              (self.VIDEO_TITLE_RE, u'video title'),
                              (self.VIDEO_THUMB_RE, u'video thumbnail')):
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.trouble(u'ERROR: unable to extract %s' % what)
                return
            captured.append(found.group(1))

        raw_url, raw_title, raw_thumb = captured
        video_url = compat_urllib_parse.unquote(raw_url.decode('utf-8'))
        video_title = raw_title.decode('utf-8')
        video_thumbnail = raw_thumb.decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3471
3472
3473 class GooglePlusIE(InfoExtractor):
3474     """Information extractor for plus.google.com."""
3475
3476     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3477     IE_NAME = u'plus.google'
3478
3479     def __init__(self, downloader=None):
3480         InfoExtractor.__init__(self, downloader)
3481
3482     def report_extract_entry(self, url):
3483         """Report downloading extry"""
3484         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3485
3486     def report_date(self, upload_date):
3487         """Report downloading extry"""
3488         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3489
3490     def report_uploader(self, uploader):
3491         """Report downloading extry"""
3492         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3493
3494     def report_title(self, video_title):
3495         """Report downloading extry"""
3496         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3497
3498     def report_extract_vid_page(self, video_page):
3499         """Report information extraction."""
3500         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3501
3502     def _real_extract(self, url):
3503         # Extract id from URL
3504         mobj = re.match(self._VALID_URL, url)
3505         if mobj is None:
3506             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3507             return
3508
3509         post_url = mobj.group(0)
3510         video_id = mobj.group(2)
3511
3512         video_extension = 'flv'
3513
3514         # Step 1, Retrieve post webpage to extract further information
3515         self.report_extract_entry(post_url)
3516         request = compat_urllib_request.Request(post_url)
3517         try:
3518             webpage = compat_urllib_request.urlopen(request).read()
3519         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3520             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3521             return
3522
3523         # Extract update date
3524         upload_date = None
3525         pattern = 'title="Timestamp">(.*?)</a>'
3526         mobj = re.search(pattern, webpage)
3527         if mobj:
3528             upload_date = mobj.group(1)
3529             # Convert timestring to a format suitable for filename
3530             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3531             upload_date = upload_date.strftime('%Y%m%d')
3532         self.report_date(upload_date)
3533
3534         # Extract uploader
3535         uploader = None
3536         pattern = r'rel\="author".*?>(.*?)</a>'
3537         mobj = re.search(pattern, webpage)
3538         if mobj:
3539             uploader = mobj.group(1)
3540         self.report_uploader(uploader)
3541
3542         # Extract title
3543         # Get the first line for title
3544         video_title = u'NA'
3545         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3546         mobj = re.search(pattern, webpage)
3547         if mobj:
3548             video_title = mobj.group(1)
3549         self.report_title(video_title)
3550
3551         # Step 2, Stimulate clicking the image box to launch video
3552         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3553         mobj = re.search(pattern, webpage)
3554         if mobj is None:
3555             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3556
3557         video_page = mobj.group(1)
3558         request = compat_urllib_request.Request(video_page)
3559         try:
3560             webpage = compat_urllib_request.urlopen(request).read()
3561         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3562             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3563             return
3564         self.report_extract_vid_page(video_page)
3565
3566
3567         # Extract video links on video page
3568         """Extract video links of all sizes"""
3569         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3570         mobj = re.findall(pattern, webpage)
3571         if len(mobj) == 0:
3572             self._downloader.trouble(u'ERROR: unable to extract video links')
3573
3574         # Sort in resolution
3575         links = sorted(mobj)
3576
3577         # Choose the lowest of the sort, i.e. highest resolution
3578         video_url = links[-1]
3579         # Only get the url. The resolution part in the tuple has no use anymore
3580         video_url = video_url[-1]
3581         # Treat escaped \u0026 style hex
3582         video_url = unicode(video_url, "unicode_escape")
3583
3584
3585         return [{
3586             'id':       video_id.decode('utf-8'),
3587             'url':      video_url,
3588             'uploader': uploader.decode('utf-8'),
3589             'upload_date':  upload_date.decode('utf-8'),
3590             'title':    video_title.decode('utf-8'),
3591             'ext':      video_extension.decode('utf-8'),
3592         }]