Fix Dailymotion in Python 3
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces a list of
    dictionaries describing the video(s) behind it; the FileDownloader
    then acts on those dictionaries, typically by downloading the video.

    Required dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of dictionaries as described above.  They should also be added
    to the list of extractors.  Broken IEs should set _WORKING = False so
    users are warned and the tests skip them.
    """

    # Class-level defaults; __init__ resets _ready per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc); runs at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display size string (used by --list-formats output).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class match
        # (which passes no flags) cannot be reused here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML (as a str) into .srt format."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration show for 4s
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then (optionally) log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        # urlencode() returns str; POST bodies must be bytes under Python 3
        # (the .encode() is a harmless no-op round-trip under Python 2).
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        try:
            self.report_login()
            # Decode the response so the str regex below works on Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form).encode('utf-8'))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' contexts until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This expression did not match; try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    # Decode to str so the str regex below works on Python 3.
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # dict.keys() is a non-indexable view on Python 3;
                    # materialize it before indexing.
                    srt_lang = list(srt_lang_list.keys())[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Exceptions are not indexable on Python 3 (trouble[0] raises
                # TypeError there); .args[0] is equivalent on both versions.
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # urlencode() returns str; POST bodies must be bytes under Python 3
        # (the .encode() is a harmless round-trip under Python 2).
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('utf-8'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately so every later str regex works on Python 3
            # (re refuses to apply a str pattern to bytes there).
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage was decoded above, so these are already unicode strings;
        # the old .decode('utf-8') calls would raise AttributeError on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; accepts any dailymotion ccTLD (2-3 letters, e.g. .com,
    # .fr) with optional scheme/www; group 1 captures the raw video path segment.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
641
642     def __init__(self, downloader=None):
643         InfoExtractor.__init__(self, downloader)
644
645     def report_download_webpage(self, video_id):
646         """Report webpage download."""
647         self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
648
649     def report_extraction(self, video_id):
650         """Report information extraction."""
651         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
652
653     def _real_extract(self, url):
654         # Extract id and simplified title from URL
655         mobj = re.match(self._VALID_URL, url)
656         if mobj is None:
657             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
658             return
659
660         video_id = mobj.group(1).split('_')[0].split('?')[0]
661
662         video_extension = 'mp4'
663
664         # Retrieve video webpage to extract further information
665         request = compat_urllib_request.Request(url)
666         request.add_header('Cookie', 'family_filter=off')
667         try:
668             self.report_download_webpage(video_id)
669             webpage_bytes = compat_urllib_request.urlopen(request).read()
670             webpage = webpage_bytes.decode('utf-8')
671         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
672             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
673             return
674
675         # Extract URL, uploader and title from webpage
676         self.report_extraction(video_id)
677         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
678         if mobj is None:
679             self._downloader.trouble(u'ERROR: unable to extract media URL')
680             return
681         flashvars = compat_urllib_parse.unquote(mobj.group(1))
682
683         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
684             if key in flashvars:
685                 max_quality = key
686                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
687                 break
688         else:
689             self._downloader.trouble(u'ERROR: unable to extract video URL')
690             return
691
692         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
693         if mobj is None:
694             self._downloader.trouble(u'ERROR: unable to extract video URL')
695             return
696
697         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
698
699         # TODO: support choosing qualities
700
701         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
702         if mobj is None:
703             self._downloader.trouble(u'ERROR: unable to extract title')
704             return
705         video_title = unescapeHTML(mobj.group('title'))
706
707         video_uploader = None
708         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
709         if mobj is None:
710             # lookin for official user
711             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
712             if mobj_official is None:
713                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
714             else:
715                 video_uploader = mobj_official.group(1)
716         else:
717             video_uploader = mobj.group(1)
718
719         video_upload_date = None
720         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
721         if mobj is not None:
722             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
723
724         return [{
725             'id':       video_id,
726             'url':      video_url,
727             'uploader': video_uploader,
728             'upload_date':  video_upload_date,
729             'title':    video_title,
730             'ext':      video_extension,
731         }]
732
733
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title, description and (optionally) the
        thumbnail from a Google Video page; returns a one-element info list
        or None after reporting an error."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode to text so the regexes below work on Python 3 as well
            # as Python 2 (same fix as in DailymotionIE).
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No MP4 download link found; fall back to the FLV stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Undo the JavaScript hex escapes for '=' and '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires an extra search-page request)
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
825
826
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV media URL, title and uploader from a Photobucket
        page; returns a one-element info list or None after reporting an
        error."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to text so the regexes below work on Python 3 as well
            # as Python 2 (same fix as in DailymotionIE).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
890
891
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to their canonical /watch/
        form, recursing once with new_video=False.  Returns a one-element
        info list or None after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode to text so the regexes below work on Python 3 as
                # well as Python 2 (same fix as in DailymotionIE).
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) only matches 'people' or 'profile'; the nickname itself
        # is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1032
1033
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video URL and metadata from a Vimeo page.

        Vimeo embeds a JSON config object in the page; the download URL is
        reconstructed from its request signature/timestamp and the best
        available codec/quality combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            # Decode to text once so the string splitting, the regexes and
            # json.loads below all work on Python 3 as well as Python 2.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first (best) quality bucket that has any file.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1145
1146
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body as text (None on failure)."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode to text so grep_webpage's str regexes work on Python 3
            # as well as Python 2 (same fix as in DailymotionIE).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and map the groups
        described by *matchTuples* -- (group_index, key, error_message)
        triples -- into a dict.  Returns None after reporting an error if
        the pattern or any required group fails to match."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP data for a live-stream page.

        NOTE(review): the computed video_url is currently discarded and
        nothing is returned, so live streams are reported but not actually
        downloaded -- TODO confirm intended behavior.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of intermediate documents of an arte+7 page and
        return the info dict for the HD stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and arte+7 extraction based on the
        URL's final path component."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams yield no info dict (see extractLiveStream).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1282
1283
1284 class GenericIE(InfoExtractor):
1285     """Generic last-resort information extractor."""
1286
1287     _VALID_URL = r'.*'
1288     IE_NAME = u'generic'
1289
    def __init__(self, downloader=None):
        # No extractor-specific state; delegate straight to the base class.
        InfoExtractor.__init__(self, downloader)
1292
1293     def report_download_webpage(self, video_id):
1294         """Report webpage download."""
1295         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1296         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1297
1298     def report_extraction(self, video_id):
1299         """Report information extraction."""
1300         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1301
1302     def report_following_redirect(self, new_url):
1303         """Report information extraction."""
1304         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1305         
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues an HTTP HEAD request for *url*; if the server redirects to a
        different URL, hands the destination back to the downloader (which
        restarts the whole extractor chain) and returns True.  Returns False
        when no redirect takes place.
        """
        class HeadRequest(compat_urllib_request.Request):
            # A Request that performs HEAD instead of GET, so redirects can
            # be discovered without downloading any response body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl): 
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20') 
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl, 
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(), 
                                       unverifiable=True) 
                else: 
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) 

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers): 
                # Drain and close the failed response before retrying as GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(), 
                                                 headers=newheaders, 
                                                 origin_req_host=req.get_origin_req_host(), 
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector() 
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect: let normal extraction proceed.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the download chain with the resolved URL.
        self._downloader.download([new_url])
        return True
1360
1361     def _real_extract(self, url):
1362         if self._test_redirect(url): return
1363
1364         video_id = url.split('/')[-1]
1365         request = compat_urllib_request.Request(url)
1366         try:
1367             self.report_download_webpage(video_id)
1368             webpage = compat_urllib_request.urlopen(request).read()
1369         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1370             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1371             return
1372         except ValueError as err:
1373             # since this is the last-resort InfoExtractor, if
1374             # this error is thrown, it'll be thrown here
1375             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1376             return
1377
1378         self.report_extraction(video_id)
1379         # Start with something easy: JW Player in SWFObject
1380         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1381         if mobj is None:
1382             # Broaden the search a little bit
1383             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1384         if mobj is None:
1385             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1386             return
1387
1388         # It's possible that one of the regexes
1389         # matched, but returned an empty group:
1390         if mobj.group(1) is None:
1391             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1392             return
1393
1394         video_url = compat_urllib_parse.unquote(mobj.group(1))
1395         video_id = os.path.basename(video_url)
1396
1397         # here's a fun little line of code for you:
1398         video_extension = os.path.splitext(video_id)[1][1:]
1399         video_id = os.path.splitext(video_id)[0]
1400
1401         # it's tempting to parse this further, but you would
1402         # have to take into account all the variations like
1403         #   Video Title - Site Name
1404         #   Site Name | Video Title
1405         #   Video Title - Tagline | Site Name
1406         # and so on and so forth; it's just not practical
1407         mobj = re.search(r'<title>(.*)</title>', webpage)
1408         if mobj is None:
1409             self._downloader.trouble(u'ERROR: unable to extract title')
1410             return
1411         video_title = mobj.group(1).decode('utf-8')
1412
1413         # video uploader is domain name
1414         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1415         if mobj is None:
1416             self._downloader.trouble(u'ERROR: unable to extract title')
1417             return
1418         video_uploader = mobj.group(1).decode('utf-8')
1419
1420         return [{
1421             'id':       video_id.decode('utf-8'),
1422             'url':      video_url.decode('utf-8'),
1423             'uploader': video_uploader,
1424             'upload_date':  None,
1425             'title':    video_title,
1426             'ext':      video_extension.decode('utf-8'),
1427         }]
1428
1429
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # query stays a text string throughout; calling .decode() on it
        # would crash under Python 3 (str has no decode method).
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ytsearch[N|all]: query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so the search term itself may
        # contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        # Do not encode the query to bytes: quote_plus and the format
        # strings below expect text on Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            # json.loads() requires text, not bytes, on Python 3 (< 3.6);
            # the API replies in UTF-8.
            api_response = json.loads(data.decode('utf-8'))['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; shrink the limit accordingly.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1504
1505
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query stays a text string throughout; calling .decode() on it
        # would crash under Python 3 (str has no decode method).
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a gvsearch[N|all]: query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so the search term itself may
        # contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        # Keep the query as text; encoding it to bytes breaks quote_plus
        # and string formatting under Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode the result page so the str regexes below work on
                # Python 3 (urlopen() returns bytes).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
1586
1587
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # query stays a text string throughout; calling .decode() on it
        # would crash under Python 3 (str has no decode method).
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a yvsearch[N|all]: query and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only, so the search term itself may
        # contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        # Keep the query as text; encoding it to bytes breaks quote_plus
        # and string formatting under Python 3.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode the result page so the str regexes below work on
                # Python 3 (urlopen() returns bytes).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1670
1671
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing video id means the URL points at one specific video.
        if match.group(3) is not None:
            self._downloader.download([match.group(3)])
            return

        # 'p' is the default prefix for playlists; artist pages ('a') need
        # a different access point.
        if match.group(1) == 'a':
            playlist_prefix, playlist_access = 'a', 'artist'
        else:
            playlist_prefix, playlist_access = 'p', 'view_play_list'
        playlist_id = match.group(2)

        video_ids = []
        page_num = 1

        # Walk the paginated listing until the "next page" marker vanishes.
        while True:
            self.report_download_page(playlist_id, page_num)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, page_num)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # First occurrence of each id on this page wins.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                video_id = id_match.group(1)
                if video_id not in page_ids:
                    page_ids.append(video_id)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_num = page_num + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1750
1751
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and queue them for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        page_num = 1

        # Walk the paginated channel listing until the "next page" marker
        # vanishes.
        while True:
            self.report_download_page(channel_id, page_num)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, page_num))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # First occurrence of each id on this page wins.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = id_match.group(1)
                if video_id not in page_ids:
                    page_ids.append(video_id)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_num = page_num + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1802
1803
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of the user via the GData API and queue
        them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                # Decode to text: urlopen() returns bytes on Python 3, while
                # _VIDEO_INDICATOR is a str pattern, and re refuses to mix
                # the two.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1885
1886
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the AJAX endpoint is embedded
            # in the user's page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the error handler above.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Deduplicate on the unescaped id: the raw match was
                # previously tested against the list of *unescaped* ids, so
                # hrefs containing HTML entities were appended repeatedly.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1977
1978
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to the direct file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # POST data must be bytes under Python 3, so encode the urlencoded
        # form body.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('ascii'))
        try:
            self.report_download_webpage(file_id)
            # Decode the page once so the str regexes below work in
            # Python 3 (urlopen() returns bytes).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already text; .decode() on them would crash in
        # Python 3.
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2041
2042
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (via --username/--password or .netrc) and scrapes
    the video page for title, uploader, thumbnail and the per-quality
    stream URLs embedded in the page's JavaScript.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered from best to worst quality
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from the (already decoded) page text."""
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JavaScript string literals with \uXXXX escapes;
                # round-trip through bytes to interpret them (Python 3: str
                # has no .decode method)
                video_info[piece] = compat_urllib_parse.unquote_plus(
                    mobj.group(1).encode('utf-8').decode('unicode_escape'))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(
                    mobj.group(1).encode('utf-8').decode('unicode_escape'))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook if credentials are available (options or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data must be bytes on Python 3
        request = compat_urllib_request.Request(self._LOGIN_URL,
            compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        try:
            self.report_login()
            # Decode before matching str regexes against the response
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            # Decode here so _parse_page can run str regexes (Python 3)
            video_webpage = page.read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (already a unicode string; no .decode needed)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    # strftime accepts the 9-tuple prefix of parsedate_tz's result
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except Exception:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        # Guard against an empty map: previously video_url_list could be
        # referenced unbound below when no stream URL was found
        if not url_map:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific format
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   format_param if format_param is not None else u'NA',
                'thumbnail':    video_thumbnail,
                'description':  video_description,
            })
        return results
2248
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the lowercase alphanumeric extension from a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return the info dict, either for a direct media URL (detected
        via the Content-Type header) or via blip.tv's JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON API query string with the correct separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # basename is already text on Python 3; no .decode('UTF-8')
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp format example: '12-31-12 11:47PM'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves different formats depending on the user agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2338
2339
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed typo: was self._download.trouble (AttributeError)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode so the str regexes below work on Python 3
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail URL reveals the media server path for the video
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2397
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Bitrates, ordered from highest to lowest
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print the known bitrate/extension/dimension table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve the show/episode URL and return one info dict per item
        of the episode's MRSS index feed."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Abbreviations like :tds point at the newest full episode
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            # Decode so the str regexes below work on Python 3
            html = htmlHandle.read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirected us to the newest episode; re-parse its URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # ElementTree.fromstring accepts the raw bytes directly
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed local: avoid shadowing the builtin `format`)
            video_format, video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2608
2609
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Scrape the page's OpenGraph tags, then fetch the player's
        JavaScript config to find the actual media URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset in the Content-Type header, defaulting to utf-8
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            # Decode the response so the str operations below work on Python 3
            configJSON = compat_urllib_request.urlopen(configUrl).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2681
2682
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the metadata XML, then the f4m manifest, and assemble the
        final fragment URL for the video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        metadata_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metadata_xml = compat_urllib_request.urlopen(metadata_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Pull title/description/thumbnail and the manifest location
        metadata_doc = xml.etree.ElementTree.fromstring(metadata_xml)
        try:
            video_node = metadata_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest names the media node and the (overridden) video id
        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = '%s://%s/z%s/%sSeg1-Frag1' % (parsed.scheme, parsed.netloc, video_id[:-2], node_id)
        info['ext'] = 'f4f'
        return [info]
2753
2754
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information is being extracted from the page."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it to text for the str regexes below.
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            page_bytes = compat_urllib_request.urlopen(request).read()
            webpage = page_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The direct FLV URL is percent-encoded in the player parameters.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the entire matched URL (group 0) is used.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2824
2825
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track through the public API (resolve.json), then asks
    the stream endpoint for the playable mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Announce that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that the stream URL is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader and the slug of the song title are in the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track URL into the API's JSON record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Second request: the streams endpoint yields the actual media URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2898
2899
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The stream path is embedded base64-encoded in the page; decoding it
    yields the rtmpe URL of the video.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        import base64  # local import: replaces the Python 2-only str.decode('base64')

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # urlopen().read() returns bytes on Python 3; decode so the
            # str-pattern regexes below work on both Python versions.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the stream path is base64-encoded in the
        # jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title. Match groups are already str on Python 3; the
        # old .decode('utf-8') calls crashed there.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Video id and extension come from the final path segment of the
        # stream URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2968
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the list of candidate URLs for the requested format; when
        bitrate info is present, uses the requested (or highest) bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url. Match groups are already
        # str on Python 3; the old .decode('utf-8') calls crashed there.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen returns bytes; decode before json.loads)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try every advertised format until one has a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3081
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (playlist of videos), and the site root (playlist of courses).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # bytes are fine here: ElementTree.fromstring accepts them.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode to str: urlopen().read() is bytes on Python 3 and
                # the str-pattern regexes below would raise TypeError.
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse through the generic extract() entry point so each
                # video page goes through the specific-video branch above.
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Same bytes-vs-str fix as the course-page branch.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3198
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode the whole page once (iso-8859-1 maps all byte values,
            # so this is lossless). The old code ran str regexes over the
            # raw bytes and decoded individual groups, which breaks on
            # Python 3 where bytes don't match str patterns and str has
            # no .decode().
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3288
3289
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as multiple segments; the real segment URLs
    are derived from an obfuscated file id that must be de-scrambled
    with a seed taken from the player config JSON.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components,
        # matching the format the site's player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Shuffle the character alphabet with a seed-driven linear
        # congruential generator. The server scrambles ids with the same
        # scheme, so the scrambled id's indices map into this list.
        # NOTE(review): the constants (211, 30031, 65536) mirror the
        # site's player code and must not be changed.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # De-scramble: each '*'-separated token in fileId is an index
        # into the seed-shuffled alphabet built above.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns the player config JSON (title, seed,
        # scrambled file ids and per-segment keys).
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # Map the requested format onto Youku's internal format names.
            # NOTE(review): 'worst' maps to mp4 and any other explicit
            # request falls back to flv — presumably mp4 is the lowest
            # quality offered; confirm against the site's format table.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (hex) into the fileid, then build
            # the per-segment download URL with the segment's key.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3404
3405
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce that the video page is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information is being extracted from the page."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it to text in a single step.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The FLV URL is percent-encoded inside the player parameters.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3468
3469
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        # url is already a text string; calling .decode() on it broke Python 3
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract video information from a Google+ post URL.

        Returns a single-entry list with id/url/uploader/upload_date/
        title/ext, or None after reporting an error via the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            # Decode immediately so all the regexes below match str against
            # str (matching a str pattern on bytes raises TypeError on Py3)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # was missing: falling through raised AttributeError on mobj.group
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # was missing: links[-1] below raised IndexError on an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex. The unicode() builtin was removed
        # in Python 3; round-trip through bytes with the unicode_escape codec
        video_url = video_url.encode('utf-8').decode('unicode_escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3590
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video information from an nba.com video page URL.

        Returns a single-entry info list, or None after reporting an
        error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            # Page encoding is assumed UTF-8; undecodable bytes are dropped
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page's video id
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was misspelled 'uploader_date', so the date never
            # reached the FileDownloader (which documents 'upload_date')
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]