various py3 fixes; all tests green on 3.3
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) receives a URL and produces information
    about the video(s) it points to: the real media URL, title, uploader
    and so on. Each video is described by a dictionary, which is handed to
    the FileDownloader for the actual download (or other processing).

    Required dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); _real_extract() must return a
    *list* of dictionaries shaped as above. Subclasses should normally also
    be registered in the list of extractors. Broken extractors should set
    _WORKING to False so users are warned and the tests skip them.
    """

    # One-time-initialization flag; flipped by initialize().
    _ready = False
    # The FileDownloader consuming our results (set via set_downloader).
    _downloader = None
    # Whether this extractor is currently known to work.
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this extractor can handle the given URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def working(self):
        """Return the value of the _WORKING flag."""
        return self._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if necessary, then return the list of info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader that will consume this IE's results."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
102
103
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (anything missing falls back to 'flv')
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string shown by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, so VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur:
                dur = '4'  # default caption duration when the dur attribute is absent
            start = float(start)
            end = start + float(dur)
            # SubRip timestamp: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age. All failures are reported, never raised."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video information for a YouTube URL.

        Returns a list of info dictionaries (one per selected format), or
        None after reporting trouble to the downloader.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # The URL is JS-escaped in the page; strip the backslashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in order until one of them
        # yields a response that contains a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Stop at the first format that parses; trying further
                    # formats on the already-normalized date would only fail.
                    break
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # py3 fix: dict views are not subscriptable, materialize first
                    srt_lang = list(srt_lang_list.keys())[0]
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
            except Trouble as trouble:
                # Subtitle problems are warnings, never fatal for the download.
                self._downloader.trouble(str(trouble))

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            # Require 'sig' as well: entries without it used to crash the
            # url_map construction below with a KeyError.
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud and 'sig' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
507
508
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos become accessible for this session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video.

        Returns a single-element list of info dictionaries, or None after
        reporting trouble to the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate the download
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # py3 fix: decode the response bytes so the str regexes below work
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob when no direct mediaURL is present
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # py3 fix: the page is already str; no .decode() here
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
634
635
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Announce that the video webpage is being fetched."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce that metadata extraction has started."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The id is the path component after /video/, stripped of any
        # "_title" suffix and query string.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted clips remain reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URLs are stored in the page's "flashvars" variable.
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality, from highest to lowest.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then the "official user" markup.
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # The upload date is displayed as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
732
733
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # Decode immediately: on Python 3 urlopen().read() returns bytes,
        # and the str regexes below would raise TypeError on bytes.
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct download link; fall back to the escaped Flash URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # Un-escape the JavaScript hex escapes ('=' and '&')
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE: webpage is already text; calling .decode() on it (as the old
        # code did) raises AttributeError on Python 3.
        video_title = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when explicitly requested)
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
825
826
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Decode here: on Python 3 urlopen().read() returns bytes, while
        # the str regexes below require text.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Regex groups are already text on both Python 2 (unicode page) and
        # Python 3; the old .decode('utf-8') calls broke on Python 3.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
890
891
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode: urlopen().read() returns bytes on Python 3, while
                # the regexes below expect text.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the (people|profile) alternation; the uploader name is
        # in group(2) -- the old code wrongly returned "people"/"profile".
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1032
1033
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.  The subdomain dot is escaped so that
    # e.g. "wwwXvimeo.com" is not accepted (the old pattern used a bare '.').
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is missing; ValueError: broken
            # JSON.  A bare "except:" here (as before) would also swallow
            # KeyboardInterrupt/SystemExit.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1146
1147
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body as text, or None on error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode: urlopen().read() returns bytes on Python 3, but the
            # callers apply str regexes to the result.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and extract the groups listed in matchTuples.

        matchTuples is a list of (group_index, key, error_message); returns
        a dict mapping each key to its matched group, or None on failure.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage already reported the problem; without this guard
            # re.search(None) would raise TypeError.
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP data for a live stream page.

        NOTE: the resolved video_url is currently discarded -- live streams
        are detected but not actually downloaded (see _real_extract).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the arte+7 redirect chain and return the video info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams are recognised but not yet supported
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1283
1284
1285 class GenericIE(InfoExtractor):
1286     """Generic last-resort information extractor."""
1287
1288     _VALID_URL = r'.*'
1289     IE_NAME = u'generic'
1290
1291     def __init__(self, downloader=None):
1292         InfoExtractor.__init__(self, downloader)
1293
1294     def report_download_webpage(self, video_id):
1295         """Report webpage download."""
1296         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1297         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1298
1299     def report_extraction(self, video_id):
1300         """Report information extraction."""
1301         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1302
1303     def report_following_redirect(self, new_url):
1304         """Report information extraction."""
1305         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1306         
1307     def _test_redirect(self, url):
1308         """Check if it is a redirect, like url shorteners, in case restart chain."""
1309         class HeadRequest(compat_urllib_request.Request):
1310             def get_method(self):
1311                 return "HEAD"
1312
1313         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1314             """
1315             Subclass the HTTPRedirectHandler to make it use our 
1316             HeadRequest also on the redirected URL
1317             """
1318             def redirect_request(self, req, fp, code, msg, headers, newurl): 
1319                 if code in (301, 302, 303, 307):
1320                     newurl = newurl.replace(' ', '%20') 
1321                     newheaders = dict((k,v) for k,v in req.headers.items()
1322                                       if k.lower() not in ("content-length", "content-type"))
1323                     return HeadRequest(newurl, 
1324                                        headers=newheaders,
1325                                        origin_req_host=req.get_origin_req_host(), 
1326                                        unverifiable=True) 
1327                 else: 
1328                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) 
1329
1330         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1331             """
1332             Fallback to GET if HEAD is not allowed (405 HTTP error)
1333             """
1334             def http_error_405(self, req, fp, code, msg, headers): 
1335                 fp.read()
1336                 fp.close()
1337
1338                 newheaders = dict((k,v) for k,v in req.headers.items()
1339                                   if k.lower() not in ("content-length", "content-type"))
1340                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(), 
1341                                                  headers=newheaders, 
1342                                                  origin_req_host=req.get_origin_req_host(), 
1343                                                  unverifiable=True))
1344
1345         # Build our opener
1346         opener = compat_urllib_request.OpenerDirector() 
1347         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1348                         HTTPMethodFallback, HEADRedirectHandler,
1349                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1350             opener.add_handler(handler())
1351
1352         response = opener.open(HeadRequest(url))
1353         new_url = response.geturl()
1354
1355         if url == new_url:
1356             return False
1357
1358         self.report_following_redirect(new_url)
1359         self._downloader.download([new_url])
1360         return True
1361
1362     def _real_extract(self, url):
1363         if self._test_redirect(url): return
1364
1365         video_id = url.split('/')[-1]
1366         request = compat_urllib_request.Request(url)
1367         try:
1368             self.report_download_webpage(video_id)
1369             webpage = compat_urllib_request.urlopen(request).read()
1370         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1371             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1372             return
1373         except ValueError as err:
1374             # since this is the last-resort InfoExtractor, if
1375             # this error is thrown, it'll be thrown here
1376             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1377             return
1378
1379         self.report_extraction(video_id)
1380         # Start with something easy: JW Player in SWFObject
1381         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1382         if mobj is None:
1383             # Broaden the search a little bit
1384             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1385         if mobj is None:
1386             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1387             return
1388
1389         # It's possible that one of the regexes
1390         # matched, but returned an empty group:
1391         if mobj.group(1) is None:
1392             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1393             return
1394
1395         video_url = compat_urllib_parse.unquote(mobj.group(1))
1396         video_id = os.path.basename(video_url)
1397
1398         # here's a fun little line of code for you:
1399         video_extension = os.path.splitext(video_id)[1][1:]
1400         video_id = os.path.splitext(video_id)[0]
1401
1402         # it's tempting to parse this further, but you would
1403         # have to take into account all the variations like
1404         #   Video Title - Site Name
1405         #   Site Name | Video Title
1406         #   Video Title - Tagline | Site Name
1407         # and so on and so forth; it's just not practical
1408         mobj = re.search(r'<title>(.*)</title>', webpage)
1409         if mobj is None:
1410             self._downloader.trouble(u'ERROR: unable to extract title')
1411             return
1412         video_title = mobj.group(1)
1413
1414         # video uploader is domain name
1415         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1416         if mobj is None:
1417             self._downloader.trouble(u'ERROR: unable to extract title')
1418             return
1419         video_uploader = mobj.group(1)
1420
1421         return [{
1422             'id':       video_id,
1423             'url':      video_url,
1424             'uploader': video_uploader,
1425             'upload_date':  None,
1426             'title':    video_title,
1427             'ext':      video_extension,
1428         }]
1429
1430
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:terms)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query arrives utf-8-encoded but is decoded with the
        # locale encoding here; this assumes a UTF-8 locale -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split on the first ':' only, so search terms that
        # themselves contain a colon no longer break the 2-way unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # limit is tightened to the API-reported total once known.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUGFIX: decode to text; json.loads() rejects bytes on
                # Python 3.3-3.5.
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1505
1506
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:terms)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query arrives utf-8-encoded but is decoded with the
        # locale encoding here; this assumes a UTF-8 locale -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split on the first ':' only, so search terms that
        # themselves contain a colon no longer break the 2-way unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUGFIX: decode to text so the str regexps below work on
                # Python 3 (read() returns bytes there).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (first occurrence only, in order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue whatever we collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1587
1588
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:terms)."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): query arrives utf-8-encoded but is decoded with the
        # locale encoding here; this assumes a UTF-8 locale -- confirm.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # BUGFIX: split on the first ':' only, so search terms that
        # themselves contain a colon no longer break the 2-way unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUGFIX: decode to text so the str regexps below work on
                # Python 3 (read() returns bytes there).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated via already_seen)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link: queue whatever we collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1671
1672
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Handles course/view_play_list/my_playlists/artist/playlist URL
    variants.  In _VALID_URL, group 1 is the playlist type marker
    ('p', 'a' or 'list'), group 2 the playlist id, and group 3 (when
    present) a single video id inside the playlist.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group 3 carries the video id directly.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # Anything other than 'a' is treated as an ordinary playlist;
            # NOTE this also overwrites a 'list' marker with 'p'.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        # Page through the playlist until the "Next" marker disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (first occurrence only, in order)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user-requested --playlist-start/--playlist-end window;
        # playlistend == -1 means "through the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1751
1752
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of the channel and queue it for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Walk the paginated channel listing until no "Next" link remains.
        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids found on this page, first occurrence only, in order.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1803
1804
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch the user's uploads via the GData API and queue them."""
        # Pull the username out of the URL (or the ytuser: shorthand).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries,
        # so request successive windows until one comes back short --
        # that window is necessarily the last.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids found in this window, first occurrence only, in order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A short window means there are no further pages to query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start/--playlist-end window;
        # playlistend == -1 means "through the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1886
1887
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their episodes."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # BUGFIX: a page without the attribute used to raise an uncaught
        # AttributeError (mobj.group on None); fail with a proper message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user ID from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUGFIX: use compat_str(err) like every other extractor
                # (plain str() was an inconsistency here).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (first occurrence only, in order)
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start/--playlist-end window;
        # playlistend == -1 means "through the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1978
1979
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Simulate the 'Free download' flow and extract the real file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode once to text; on Python 3 read() returns bytes,
            # which broke the str regexps below, and .decode() calls on the
            # matched str results raised AttributeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the multi-line notice into one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2042
2043
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook video pages.

    Optionally logs in first (via --username/--password or .netrc), then
    scrapes the video page for title, uploader, thumbnail and the
    per-quality video URLs embedded in the page's JavaScript.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers as found in the page, best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from an (already decoded) page."""
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped (\uXXXX, \/) inside the page.
                # py3 fix: str has no .decode(), so round-trip through
                # bytes to apply the unicode_escape codec.
                video_info[piece] = compat_urllib_parse.unquote_plus(
                    mobj.group(1).encode('utf-8').decode('unicode_escape'))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(
                    mobj.group(1).encode('utf-8').decode('unicode_escape'))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in with credentials from the command line or .netrc, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # Anonymous access; many videos are still reachable.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # py3 fix: POST data passed to urlopen must be bytes.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        try:
            self.report_login()
            # py3 fix: decode the response so the str regex below can match it.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            # py3 fix: decode once here; all regexes in _parse_page are str.
            video_webpage = page.read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title (py3 fix: already a str, no .decode() needed)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        # NOTE(review): _parse_page never sets 'upload_date', so this branch
        # is currently dead unless the page parser is extended — confirm.
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format
        else:
            # Bug fix: video_url_list was previously left unbound here,
            # causing an UnboundLocalError below instead of a clean error.
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # py3 fix: all of these are already str; the former .decode()
            # calls raised AttributeError under Python 3.
            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   (format_param is None and u'NA' or format_param),
                'thumbnail':    video_thumbnail,
                'description':  video_description,
            })
        return results
2249
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Uses the site's JSON API (skin=json) for regular pages and falls back
    to a direct download when the URL already serves a video/* resource.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON API query to whatever query string already exists.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # py3 fix: basename is already str — the old
                # title.decode('UTF-8') raised AttributeError on Python 3.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Single posts are wrapped in a 'Post' object; playlists
                # return the fields at the top level.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves different content to iTunes; spoof its User-Agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2339
2340
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: was self._download (nonexistent attribute), which
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            # py3 fix: decode the page so the str regexes below can match it.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the media path; the flv lives alongside.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2398
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                  or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                  or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, ascending quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be compiled with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms (":tds", ":colbert", ...) mean "newest full episode".
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            # py3 fix: decode the page; the str regexes below cannot search bytes.
            html = htmlHandle.read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The /full-episodes/ page redirects to the newest episode;
            # re-match against the final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # ElementTree.fromstring accepts bytes, so indexXml needs no decoding.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed from 'format' to avoid shadowing the builtin)
            video_format, video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    video_format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            # NOTE(review): officialDate is RFC 2822 pubDate text, not the
            # YYYYMMDD form used by other IEs — confirm downstream handling.
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2609
2610
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset advertised in the Content-Type header.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in <meta> tags; the player URL carries a config= query
        # parameter pointing at the JSON(ish) configuration.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            # py3 fix: decode immediately — the .replace() below takes str
            # arguments and json.loads needs str as well.
            configJSON = compat_urllib_request.urlopen(configUrl).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2682
2683
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        result = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # Step 1: fetch the moogaloop metadata XML for this video id.
        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            result['description'] = video_node.findall('./description')[0].text
            result['title'] = video_node.findall('./caption')[0].text
            result['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Step 2: fetch the f4m manifest referenced by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Step 3: assemble the fragment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        result['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        result['ext'] = 'f4f'
        return [result]
2754
2755
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video page is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that metadata is being parsed out of the page."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch the video page and pull out the flv URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Download the page; any network failure is fatal for this extractor.
        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(page_request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The flash video URL is url-encoded in a query parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail: the entire matched URL is used (group 0), not a capture.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2825
2826
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves the human-facing track URL into an API track record via
       resolve.json, then queries the streams endpoint for a direct
       128-kbit MP3 link.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track and return its stream URL plus metadata."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both uploader and track slug are encoded in the URL itself.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Turn the public page URL into an API track record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN which streams exist for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2899
2900
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL (base64-encoded in the page) plus
        title and description.

        Python 3 fixes: the page bytes are decoded before running str
        regexes, str.decode('base64')/str.decode('utf-8') (removed in
        py3) are replaced by base64.b64decode and plain str handling.
        """
        import base64  # local import keeps this fix self-contained

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # read() returns bytes on Python 3; decode once so the str
            # regexes below do not raise TypeError.
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL (base64-encoded rtmpe path)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The id and extension come from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2969
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format -> {bitrate: [url, ...]}, or — when the
        format carries no bitrate info — format -> [url, ...].
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (regex groups are already str on Python 3 — the former
        # .decode('utf-8') calls crashed there and were removed)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON: urlopen returns bytes; decode before json.loads
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe each format until one of its urls is alive
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3082
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL: a single video, a course page (playlist of
        videos), or the site root (playlist of courses).

        Python 3 fix: course/root pages are decoded from bytes before the
        str regexes run on them (re.search on bytes raised TypeError).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage_bytes = compat_urllib_request.urlopen(url).read()
                # decode before regex-searching: read() returns bytes on py3
                coursepage = coursepage_bytes.decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse into each video page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage_bytes = compat_urllib_request.urlopen(rootURL).read()
                # same py3 decode fix as for course pages above
                rootpage = rootpage_bytes.decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse into each course page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3199
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the video page's meta tags, then fetch the mediaGen XML
        and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # Python 3 fix: decode the page once (the groups were previously
            # decoded as iso-8859-1, so keep that charset) instead of running
            # str regexes against bytes, which raises TypeError on py3.
            webpage = webpage_bytes.decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        # ElementTree accepts bytes input directly; no decode needed here.
        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3289
3290
class YoukuIE(InfoExtractor):

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a session id from the current time and two random numbers."""
        now_ms = int(time.time() * 1000)
        return "%d%d%d" % (now_ms, random.randint(1000, 1998), random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the id alphabet using Youku's seeded congruential PRNG."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        seed = float(seed)
        for _ in range(len(pool)):
            seed = (seed * 211 + 30031) % 65536
            pick = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index string into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            requested = self._downloader.params.get('format', None)
            available = config['data'][0]['streamfileids'].keys()

            if requested is None or requested == 'best':
                # Prefer HD when the server offers it.
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [seg['k'] for seg in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id carry the segment number, so each
        # segment gets its own id (fileid[7:9] replaced with the index).
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3405
3406
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the page and pull video URL, title and thumbnail from it."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and turn it into text.
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv URL is url-encoded inside a query parameter.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3469
3470
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        # url is already a text string; calling .decode() on it breaks on Python 3
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date of the entry (may be None)"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader of the entry (may be None)"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title of the entry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            # Decode once so all regex matching below runs on text, not bytes
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUG FIX: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUG FIX: previously fell through and crashed indexing an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex: decode backslash escapes in a way
        # that works on both Python 2 and 3 (the builtin unicode() and
        # str.decode('unicode_escape') are gone in Python 3; the URL is
        # plain ASCII here, so round-trip through ASCII bytes).
        video_url = video_url.encode('ascii').decode('unicode_escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3591
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com / watch.nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First unescaped regex group found on the page, or `default`
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the downloader
            # contract (see file header) expects 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3637
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one page of the JSON API; return (total_count, info_dicts)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: returning None here made the caller's tuple unpacking
            # raise TypeError; report zero items so the loop stops cleanly.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a video file URL are skipped (invalid clips)
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # Single clips are not paginated; a short page means the last one
            if not paged or page_count != limit:
                break
            offset += limit
        return info