Add location field
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
from __future__ import absolute_import

import base64
import datetime
import email.utils
import math
import netrc
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
19
20
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Default state shared by all extractors; instances overwrite these.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        Runs _real_initialize() at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention subclasses are named FooIE; strip the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle.

        url_or_request: a URL string or a compat_urllib_request.Request.
        video_id:       used only for progress/error reporting.
        note/errnote:   optional progress and error messages.

        Raises ExtractorError (with the original traceback, hence the
        `sys.exc_info()[2]`) when the download fails.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string.

        The body is decoded as UTF-8 with undecodable bytes replaced, so
        the caller always receives text, never bytes.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
128
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles plain watch URLs, embed/e/v forms, youtu.be short links and
    naked video IDs (see _VALID_URL). Optionally logs in, confirms age
    and downloads closed captions as .srt.
    """

    # Verbose regex: matched with re.VERBOSE in suitable()/_extract_id(),
    # so whitespace and #-comments inside the pattern are ignored.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats not listed here default to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" string, used for --list-formats and the
    # 'format' info-dict field.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden from the base class to pass re.VERBOSE, which the
        # multi-line _VALID_URL above requires.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text.

        Each <text start="..." dur="...">caption</text> element becomes a
        numbered srt cue; a missing dur defaults to 4 seconds.
        """
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # hh:mm:ss,mmm — the comma is the srt millisecond separator.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (error, srt) tuple: on success error is None and srt is
        the .srt file contents; on failure error is a warning string and
        srt is None. Language is chosen from --subtitleslang, then 'en',
        then the first available track.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        # Map lang_code -> track name (the name is needed in the query below).
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions (--list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and optionally log in and confirm age.

        Credentials come from --username/--password or, with --netrc, from
        the 'youtube' machine entry in ~/.netrc. All failures are reported
        as warnings and abort initialization without raising.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video ID from url, or None after reporting an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # group(1) is the optional URL prefix; group(2) is the video ID.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract metadata and download URLs for a YouTube video.

        Returns a list with one info dict per selected format (several when
        --format is '-1'/'all'). Errors are reported via
        self._downloader.trouble() and yield an early return.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escapes (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several 'el' variants; some are needed for embedded-only or
        # VEVO videos. Stop at the first response carrying a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # NOTE(review): bare except and no break — after a successful
            # parse the remaining formats fail silently on the already
            # reformatted YYYYMMDD value; works, but only by accident.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): ud['sig'][0] assumes every stream entry carries a
            # 'sig' parameter; entries without one would raise KeyError —
            # TODO confirm against current get_video_info responses.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
546
547
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    YouTube-hosted metacafe videos (ids starting with 'yt-') are delegated
    to the YouTube extractor via the downloader.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and POST the age confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 read() returns bytes, and the
            # regexes below operate on text. (The previous code matched on
            # bytes and then called .decode() on str results, which breaks
            # on Python 3.)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        # All values are already text; the old .decode('utf-8') calls would
        # raise AttributeError on Python 3 str objects.
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
673
674
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Validate the URL and pull out the bare video id (strip any
        # trailing title slug or query string).
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-restricted
        # videos remain reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # The media URLs live inside the flashvars blob embedded in the page.
        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality, scanning from highest to lowest.
        for quality_key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # Unquote and unescape the JSON-style backslashed slashes.
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then fall back to the markup
        # used for official users; missing uploader is only a warning.
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date is optional; the page renders it as DD-MM-YYYY and the
        # output format is YYYYMMDD.
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
761
762
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The video id is the .flv file name carried in the "current"
        # query parameter, captured by _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = match.group(1)

        video_extension = 'flv'

        # Download the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The direct media URL is advertised via the video_src <link> tag.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both live in the page <title> element.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
826
827
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted (one level of recursion via the
        new_video flag).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is only the 'people'/'profile' path component; the actual
        # uploader name is the anchor text captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML: host part (APP) + path (FULLPATH)
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
969
970
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Parses the config JSON embedded in the watch page for metadata and
        builds the play_redirect download URL from the best available
        codec/quality pair.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON. split(...)[1] raises IndexError when the
        # marker is absent and json.loads raises ValueError on malformed
        # data; catching only these (instead of a bare except) avoids
        # swallowing KeyboardInterrupt/SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id (last path component of owner URL)
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first candidate in quality order; for/else triggers when
        # no codec bucket had any entry at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1085
1086
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after reporting
        an error through the downloader."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* (with *regexFlags*), and map the
        capture groups named in *matchTuples* — a list of
        (group_index, key, error_message) — into a dict.

        Returns None (after reporting the tuple's error message) if the
        regex does not match or a required group is empty.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL.

        NOTE(review): the final video_url is computed but never returned or
        stored, and _real_extract discards this method's result — live
        streams appear to be resolved but not actually downloadable here;
        confirm intended behavior before relying on it.
        """
        # Language code is a fixed position in the live URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        # The JS file is referenced relative to the page's host.
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML indirections and return
        the info dict for the HD stream."""
        # Language code is a fixed position in the plus7 URL path.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final XML document carries id, title, date and the HD stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extractor based on the URL.

        NOTE(review): the live-stream branch returns no info (see
        extractLiveStream above)."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1221
1222
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying as GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects: not a shortener, proceed normally.
        if url == new_url:
            return False

        # Restart the extraction chain on the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: look for a JW-Player-style file= URL in
        the page and derive id/title/uploader heuristically."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1367
1368
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles "ytsearch" pseudo-URLs: ytsearchN:query downloads the first N
    hits, ytsearch:query just the first one, ytsearchall:query as many as
    the API returns (capped at _max_youtube_results). Results are queued
    on the downloader as ordinary watch URLs.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # _real_extract encoded the query as UTF-8, so decode with the
        # same codec; the locale's preferred encoding may not be UTF-8
        # and would mangle (or fail on) non-ASCII queries.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first ':' so queries that themselves contain a
        # colon do not raise ValueError from the tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # The API serves 50 results per page; `limit` shrinks to the real
        # total once the first response reports totalItems.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last (full) page before queueing.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1443
1444
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles "gvsearch" pseudo-URLs: gvsearchN:query downloads the first N
    hits, gvsearch:query just the first one, gvsearchall:query as many as
    the site returns (capped at _max_google_results).
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # The query was encoded as UTF-8 in _real_extract; decode with the
        # same codec rather than the locale's preferred encoding, which
        # may not be UTF-8.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first ':' so queries that contain a colon do
        # not raise ValueError from the tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode explicitly so the str regexes below also work on
                # Python 3, consistent with the other extractors in this
                # file that decode their pages.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: queue whatever was collected.
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1525
1526
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles "yvsearch" pseudo-URLs: yvsearchN:query downloads the first N
    hits, yvsearch:query just the first one, yvsearchall:query as many as
    the site returns (capped at _max_yahoo_results).
    """

    # Marked broken upstream; kept for reference.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # The query was encoded as UTF-8 in _real_extract; decode with the
        # same codec rather than the locale's preferred encoding.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first ':' so queries that contain a colon do
        # not raise ValueError from the tuple unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the literal "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode explicitly so the str regexes below also work on
                # Python 3, consistent with the other extractors in this
                # file that decode their pages.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: queue whatever was collected.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1611
1612
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: the list-type query parameter ('p', 'a' or 'list');
    # group 2: the playlist id; group 3: an optional trailing video id
    # (single-video case).
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the video id straight back to the
        # downloader, which will re-dispatch it to the right extractor.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # Any other prefix (including 'list') is fetched through the
            # classic view_play_list endpoint with a 'p' parameter.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        # Page through the playlist until the "Next" link disappears.
        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within each page).
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start/--playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1691
1692
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the paginated channel listing, collects every video id, and
    queues the corresponding watch URLs on the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Keep fetching pages until the "next page" marker disappears.
        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Ids found on this page, first-seen order, no duplicates.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1743
1744
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Retrieves a user's uploads via the GData API, one page at a time,
    then queues the collected watch URLs on the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username from the URL (or ytuser: pseudo-URL).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request successive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, dropping duplicates within the page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # A page shorter than the page size is necessarily the last
            # one, so there is no need to issue another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        # Apply --playlist-start/--playlist-end (1-based; -1 means no end).
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1826
1827
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Number of videos the AJAX endpoint returns per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Episode-list AJAX endpoint; the %s placeholder is filled with
        # the numeric users_id scraped from the profile page below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # Fetch the profile page once, solely to discover the user's
            # numeric id (the data-users-id attribute).
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # NOTE(review): uses str(err) here, unlike compat_str(err)
                # elsewhere in this file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (relative hrefs, HTML-unescaped).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        # Apply --playlist-start/--playlist-end (1-based; -1 means no end).
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1918
1919
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # NOTE(review): webpage is raw bytes (no .decode()); the str
            # regexes and .decode('utf-8') calls below assume Python 2
            # str semantics — confirm the intended interpreter.
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the restriction notice onto a single line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        # All returned fields are decoded to unicode, per the contract in
        # the InfoExtractor docstring.
        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1978
1979
1980 class FacebookIE(InfoExtractor):
1981     """Information Extractor for Facebook"""
1982
1983     _WORKING = False
1984     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1985     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1986     _NETRC_MACHINE = 'facebook'
1987     _available_formats = ['video', 'highqual', 'lowqual']
1988     _video_extensions = {
1989         'video': 'mp4',
1990         'highqual': 'mp4',
1991         'lowqual': 'mp4',
1992     }
1993     IE_NAME = u'facebook'
1994
    def __init__(self, downloader=None):
        # Delegate construction to the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)
1997
    def _reporter(self, message):
        """Add header and report message."""
        # All Facebook IE status messages share the '[facebook]' prefix.
        self._downloader.to_screen(u'[facebook] %s' % message)
2001
    def report_login(self):
        """Report attempt to log in."""
        # Routed through _reporter for the shared '[facebook]' prefix.
        self._reporter(u'Logging in')
2005
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        # Routed through _reporter for the shared '[facebook]' prefix.
        self._reporter(u'%s: Downloading video webpage' % video_id)
2009
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        # Routed through _reporter for the shared '[facebook]' prefix.
        self._reporter(u'%s: Extracting video information' % video_id)
2013
2014     def _parse_page(self, video_webpage):
2015         """Extract video information from page"""
2016         # General data
2017         data = {'title': r'\("video_title", "(.*?)"\)',
2018             'description': r'<div class="datawrap">(.*?)</div>',
2019             'owner': r'\("video_owner_name", "(.*?)"\)',
2020             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2021             }
2022         video_info = {}
2023         for piece in data.keys():
2024             mobj = re.search(data[piece], video_webpage)
2025             if mobj is not None:
2026                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2027
2028         # Video urls
2029         video_urls = {}
2030         for fmt in self._available_formats:
2031             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2032             if mobj is not None:
2033                 # URL is in a Javascript segment inside an escaped Unicode format within
2034                 # the generally utf-8 page
2035                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2036         video_info['video_urls'] = video_urls
2037
2038         return video_info
2039
    def _real_initialize(self):
        """Log in to Facebook, if credentials are available.

        Credentials come from --username/--password or, failing that,
        the 'facebook' machine entry in .netrc. Without credentials this
        is a silent no-op; login failures only emit warnings.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still on the
            # login page, i.e. authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2083
2084     def _real_extract(self, url):
2085         mobj = re.match(self._VALID_URL, url)
2086         if mobj is None:
2087             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2088             return
2089         video_id = mobj.group('ID')
2090
2091         # Get video webpage
2092         self.report_video_webpage_download(video_id)
2093         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2094         try:
2095             page = compat_urllib_request.urlopen(request)
2096             video_webpage = page.read()
2097         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2098             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2099             return
2100
2101         # Start extracting information
2102         self.report_information_extraction(video_id)
2103
2104         # Extract information
2105         video_info = self._parse_page(video_webpage)
2106
2107         # uploader
2108         if 'owner' not in video_info:
2109             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2110             return
2111         video_uploader = video_info['owner']
2112
2113         # title
2114         if 'title' not in video_info:
2115             self._downloader.trouble(u'ERROR: unable to extract video title')
2116             return
2117         video_title = video_info['title']
2118         video_title = video_title.decode('utf-8')
2119
2120         # thumbnail image
2121         if 'thumbnail' not in video_info:
2122             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2123             video_thumbnail = ''
2124         else:
2125             video_thumbnail = video_info['thumbnail']
2126
2127         # upload date
2128         upload_date = None
2129         if 'upload_date' in video_info:
2130             upload_time = video_info['upload_date']
2131             timetuple = email.utils.parsedate_tz(upload_time)
2132             if timetuple is not None:
2133                 try:
2134                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2135                 except:
2136                     pass
2137
2138         # description
2139         video_description = video_info.get('description', 'No description available.')
2140
2141         url_map = video_info['video_urls']
2142         if url_map:
2143             # Decide which formats to download
2144             req_format = self._downloader.params.get('format', None)
2145             format_limit = self._downloader.params.get('format_limit', None)
2146
2147             if format_limit is not None and format_limit in self._available_formats:
2148                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2149             else:
2150                 format_list = self._available_formats
2151             existing_formats = [x for x in format_list if x in url_map]
2152             if len(existing_formats) == 0:
2153                 self._downloader.trouble(u'ERROR: no known formats available for video')
2154                 return
2155             if req_format is None:
2156                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2157             elif req_format == 'worst':
2158                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2159             elif req_format == '-1':
2160                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2161             else:
2162                 # Specific format
2163                 if req_format not in url_map:
2164                     self._downloader.trouble(u'ERROR: requested format not available')
2165                     return
2166                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2167
2168         results = []
2169         for format_param, video_real_url in video_url_list:
2170             # Extension
2171             video_extension = self._video_extensions.get(format_param, 'mp4')
2172
2173             results.append({
2174                 'id':       video_id.decode('utf-8'),
2175                 'url':      video_real_url.decode('utf-8'),
2176                 'uploader': video_uploader.decode('utf-8'),
2177                 'upload_date':  upload_date,
2178                 'title':    video_title,
2179                 'ext':      video_extension.decode('utf-8'),
2180                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2181                 'thumbnail':    video_thumbnail.decode('utf-8'),
2182                 'description':  video_description.decode('utf-8'),
2183             })
2184         return results
2185
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pattern that captures the file extension from a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that a direct (non-JSON) media download was detected."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information via blip.tv's JSON skin.

        Requests the page with skin=json parameters appended; if the server
        answers with a video/* Content-Type instead of JSON, the URL is
        treated as a direct media download and the open handle is reused.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters using the correct query separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # NOTE(review): the iTunes user agent is presumably required for the
        # server to return usable data -- confirm before changing
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # URL does not have to be requested a second time
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; read the JSON body
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert e.g. '12-31-12 03:14PM' into the YYYYMMDD form
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2275
2276
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: this read "self._download.trouble", which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the flv lives next to it
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2325
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate variants, and their extensions/dimensions used by
    # --list-formats and the format selection below
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a multi-line pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report that the media configuration XML is being downloaded."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report that the episode index (MRSS feed) is being downloaded."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format codes for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL into one info dict per video part.

        Flow: expand shortname URLs, download the page (following the
        redirect for "newest episode" URLs), locate the mtvnservices media
        URI, fetch the MRSS index, then fetch the configuration for each
        item and transform its RTMP URL into a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to full-episode URLs
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirects to the newest episode; re-parse the final URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media reference without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per video part of the episode
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            # NOTE(review): pubDate text is used as upload_date directly --
            # presumably not the YYYYMMDD form used elsewhere; confirm
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp url) pairs from the renditions
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain-HTTP one on the CDN
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2519
2520
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video information via the page's meta tags and the
        JSON-ish player configuration they point to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset announced in the Content-Type header
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # BUGFIX: each of these meta tags can be missing; previously a missing
        # tag crashed with AttributeError on None.group(1) instead of
        # reporting a proper error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2594
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a video page into an f4f fragment URL via the metadata
        XML and the Adobe f4m manifest it references."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        meta_doc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url = manifest_url + '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        manifest_doc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the URL of the first fragment of the first segment
        url_parts = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = '%s://%s/z%s/%sSeg1-Frag1' % (url_parts.scheme, url_parts.netloc, video_id[:-2], node_id)
        info['ext'] = 'f4f'
        return [info]
2665
2666
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the media URL, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is URL-encoded inside a flv_url parameter
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The page <title> carries the video title before the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched URL is the thumbnail
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2724
2725
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves the permalink into a track id via the public API, then
       queries the stream definitions and picks the 128 kbit/s MP3 stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The URL path encodes uploader and song slug
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2798
2799
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP stream url, title and description of an InfoQ talk."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in the page
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split only on the LAST dot: the decoded id may itself contain
        # dots, and a plain split('.') would raise ValueError then.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2853
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass # dead link, try the next candidate

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (regex groups are already text strings; the old .decode('utf-8')
        # calls raised AttributeError on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes before handing them to json)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try every advertised format until one yields a live url
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2968
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page, or the root.

        Course and root pages are treated as playlists: each referenced
        page is fed back through self.extract() and the results merged.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                # Decode the page to text: on Python 3 the str patterns
                # below cannot be matched against a bytes object.
                coursepage = compat_urllib_request.urlopen(url).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # Same bytes-vs-str issue as the course page above.
                rootpage = compat_urllib_request.urlopen(rootURL).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3085
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns a text string, so the match groups need
        # no extra decoding (the old .decode('iso-8859-1') calls raised
        # AttributeError on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # guard against IndexError on renditions[-1] below
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3165
3166
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates the real file ids: the playlist JSON carries a 'seed'
    plus a '*'-separated index string that must be decoded through a
    deterministic character-shuffling scheme before the per-segment
    download URLs can be built.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a session id: current time in ms plus two random components."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the shuffled alphabet ("mix string") for the given seed.

        A linear-congruential sequence seeded by the server-provided value
        picks characters one at a time, without replacement, from a fixed
        alphabet; the resulting permutation is the decoding table used by
        _get_file_id(). Returns the permutation as a list of characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; the constants must match Youku's player exactly.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id.

        Each '*'-separated token in fileId is an index into the shuffled
        alphabet produced by _get_file_ID_mix_string(seed); mapping the
        tokens through it yields the real file id.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and return one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user-requested quality onto Youku's stream names.
            # NOTE(review): the final else branch ignores any explicitly
            # requested format and falls back to 'flv' — looks intentional
            # as a safe default, but confirm before relying on it.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3276
3277
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the video page and pull out the flv url, title and thumbnail."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it to text
        try:
            page = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv url is embedded percent-encoded in the player parameters
        url_search = re.search(self.VIDEO_URL_RE, page)
        if url_search is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_search.group(1))

        title_search = re.search(self.VIDEO_TITLE_RE, page)
        if title_search is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_search.group(1)

        thumb_search = re.search(self.VIDEO_THUMB_RE, page)
        if thumb_search is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_search.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3340
3341
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out: without this return the .group() call below would
            # raise AttributeError on None.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out: without this return, links[-1] below would raise
            # IndexError on an empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3465
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN url can be built directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # was misspelled 'uploader_date'; the documented field is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3501
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one page of the Justin.tv JSON API.

        Returns a tuple (number of items in the API response, list of info
        dicts for the clips that actually have a video URL).  On error the
        problem is reported via the downloader and (0, []) is returned, so
        the caller's ``page_count, page_info = ...`` unpacking and its
        pagination loop terminate cleanly.  (Previously a bare ``return``
        yielded None and crashed the caller with a TypeError whenever
        trouble() did not raise.)
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        # On failure the API returns a JSON object carrying an 'error' key
        # instead of the expected list of clips.
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' starts with YYYY-MM-DD; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page shorter than the limit means it was the last one.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3586
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The downloadable file is the second <source> inside the <video> tag.
        video_m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not video_m:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(video_m.group('url'))

        title_m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not title_m:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(title_m.group('title'))

        # The description is optional; fall back to None when absent.
        desc_m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_m.group('desc')) if desc_m else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3623
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        status_m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not status_m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = status_m.group(1)

        desc_m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not desc_m:
            self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded anchors, unescape entities, then trim the tweet text;
        # it doubles as both the title and the description.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', desc_m.group(1))).strip()

        uploader_m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not uploader_m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(uploader_m.group('uploader'))
        uploader_id = unescapeHTML(uploader_m.group('uploader_id'))

        date_m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not date_m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(date_m.group(1))).strftime('%Y%m%d')

        return [{
            'id': video_id,
            'url': 'http://files.tweetreel.com/video/' + status_id + '.mov',
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }]
3672         
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movie definitions and their titles appear in the same page order,
        # so zip pairs each movie with its title.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        videos = []
        for vid, vtitle in zip(re.finditer(urlRE, webpage), re.finditer(namesRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName'))
            })
        return videos
3708
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        # The recorded video always lives on the CDN under its numeric id.
        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3730
3731
3732
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # An age-verification cookie is required to reach the real page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; warn and continue without it)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional; warn and continue without it)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 looks like '480p_370k_8004515': resolution, bitrate, id.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: the original tested the stale regex match object
            # 'result' (never None here), so a missing requested format was
            # silently returned as [None] instead of being reported.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3849
3850         
3851
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (host dots escaped so '.' cannot match arbitrary characters)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this error previously (and wrongly) said 'video title'.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3893
3894
3895
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page first; it only embeds the real player page.
        webpage = self._download_webpage(url, video_id)

        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_m.group(0).strip()
        # The numeric id on the embed URL supersedes the slug from the watch URL.
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via so.addVariable(...).
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_m.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3941
3942
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Kept in exact priority order: specific extractors first, GenericIE last.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3989
3990