Add KeekIE()
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces the metadata needed
    to download the video (or videos) it refers to: the real media URL,
    the title, the uploader and so on. The result is a list of
    dictionaries which is handed to the FileDownloader, which in turn may
    download the media to disk or perform other actions.

    Mandatory fields in each dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regular expression and re-define
    the _real_initialize() and _real_extract() methods; _real_extract()
    must return a *list* of dictionaries shaped as above. New extractors
    should usually also be added to the list of extractors.

    Broken extractors should set _WORKING to False so that users are
    warned and the tests skip them.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance (may stay None)
    _WORKING = True         # override with False in broken subclasses

    def __init__(self, downloader=None):
        """Create the extractor; a downloader may be attached right away."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Return whether this IE is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the public name from the class name, dropping the "IE" suffix.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open the URL (or Request object) and return the response handle.

        Raises ExtractorError (carrying the original traceback) on any
        network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Fetch the page and return its contents decoded as UTF-8 text."""
        return self._request_webpage(url_or_request, video_id, note, errnote).read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (used only for display)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with embedded whitespace/comments, hence re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration get 4 seconds
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns an (error_message, srt_contents) tuple; exactly one of
        the two elements is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option, then English, then whatever exists.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """List the given format codes with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # The login form requires the GALX and dsh tokens embedded in the page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # urlencode() returns a str; POST data must be bytes on Python 3,
        # matching the explicit encoding of login_data above.
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID captured by _VALID_URL, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract video information; returns a list of info dictionaries,
        one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break   # parsed successfully; remaining formats would only fail
                except ValueError:
                    # Date was not in this format; try the next expression.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
587
588
589 class MetacafeIE(InfoExtractor):
590     """Information Extractor for metacafe.com."""
591
592     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
593     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
594     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
595     IE_NAME = u'metacafe'
596
597     def __init__(self, downloader=None):
598         InfoExtractor.__init__(self, downloader)
599
600     def report_disclaimer(self):
601         """Report disclaimer retrieval."""
602         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
603
604     def report_age_confirmation(self):
605         """Report attempt to confirm age."""
606         self._downloader.to_screen(u'[metacafe] Confirming age')
607
608     def report_download_webpage(self, video_id):
609         """Report webpage download."""
610         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
611
612     def report_extraction(self, video_id):
613         """Report information extraction."""
614         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
615
616     def _real_initialize(self):
617         # Retrieve disclaimer
618         request = compat_urllib_request.Request(self._DISCLAIMER)
619         try:
620             self.report_disclaimer()
621             disclaimer = compat_urllib_request.urlopen(request).read()
622         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
623             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
624             return
625
626         # Confirm age
627         disclaimer_form = {
628             'filters': '0',
629             'submit': "Continue - I'm over 18",
630             }
631         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
632         try:
633             self.report_age_confirmation()
634             disclaimer = compat_urllib_request.urlopen(request).read()
635         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
636             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
637             return
638
639     def _real_extract(self, url):
640         # Extract id and simplified title from URL
641         mobj = re.match(self._VALID_URL, url)
642         if mobj is None:
643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
644             return
645
646         video_id = mobj.group(1)
647
648         # Check if video comes from YouTube
649         mobj2 = re.match(r'^yt-(.*)$', video_id)
650         if mobj2 is not None:
651             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
652             return
653
654         # Retrieve video webpage to extract further information
655         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
656         try:
657             self.report_download_webpage(video_id)
658             webpage = compat_urllib_request.urlopen(request).read()
659         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
660             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
661             return
662
663         # Extract URL, uploader and title from webpage
664         self.report_extraction(video_id)
665         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
666         if mobj is not None:
667             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
668             video_extension = mediaURL[-3:]
669
670             # Extract gdaKey if available
671             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
672             if mobj is None:
673                 video_url = mediaURL
674             else:
675                 gdaKey = mobj.group(1)
676                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
677         else:
678             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
679             if mobj is None:
680                 self._downloader.trouble(u'ERROR: unable to extract media URL')
681                 return
682             vardict = compat_parse_qs(mobj.group(1))
683             if 'mediaData' not in vardict:
684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
685                 return
686             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
687             if mobj is None:
688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
689                 return
690             mediaURL = mobj.group(1).replace('\\/', '/')
691             video_extension = mediaURL[-3:]
692             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
693
694         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
695         if mobj is None:
696             self._downloader.trouble(u'ERROR: unable to extract title')
697             return
698         video_title = mobj.group(1).decode('utf-8')
699
700         mobj = re.search(r'submitter=(.*?);', webpage)
701         if mobj is None:
702             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
703             return
704         video_uploader = mobj.group(1)
705
706         return [{
707             'id':       video_id.decode('utf-8'),
708             'url':      video_url.decode('utf-8'),
709             'uploader': video_uploader.decode('utf-8'),
710             'upload_date':  None,
711             'title':    video_title,
712             'ext':      video_extension.decode('utf-8'),
713         }]
714
715
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Fetch a Dailymotion watch page and build the info dictionary."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The captured path component may carry a "_title" slug and a query
        # string; strip both to get the bare video id.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Disable the family filter so age-restricted clips stay reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best quality the page advertises, highest first.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((k for k in quality_keys if k in flashvars), None)
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is percent-encoded and JSON-escaped ("\/" for "/").
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner span first, then the markup used
        # for official users; warn (do not fail) if neither matches.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
802
803
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only pages with a ".flv" "current=" query parameter are supported.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Fetch the page and pull the direct media URL, title and uploader."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = match.group(1)
        video_extension = 'flv'

        # Download the watch page.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The flash embed carries the direct file URL in a "?file=" parameter.
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader come from the <title> tag in one pass.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
867
868
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-/watch/ URLs are rewritten to the canonical /watch/ form and
        this method recurses once (new_video=False) on the rewritten URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 captures the 'people'/'profile' path segment; the uploader
        # name is in group 2. (Previously group(1) was used by mistake.)
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1010
1011
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Reads the embedded player config JSON and selects the best
        available codec/quality combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: ensure a scheme and rewrite direct-link
        # (play_redirect_hls) URLs to the canonical watch page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page script.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the '{config:' marker was not found;
            # ValueError: the extracted span is not valid JSON.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO timestamp -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket, best quality first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1130
1131
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body.

        On error this reports via trouble() and returns None (implicitly),
        so callers must be prepared for a None result.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map capture groups to dict keys.

        matchTuples is a list of (group_index, key, error_message) tuples.
        Returns the populated dict, or None after reporting any failure.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): fetch_webpage returns None on download failure, in
        # which case re.search below raises TypeError — confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored,
        # so live-stream extraction currently yields nothing — confirm
        # whether a return statement is missing here.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through its XML indirections to an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: the per-video XML with id, title, date and HD URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and Arte+7 extraction by URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams are recognized by their "index-NNN.html" suffix.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1266
1267
1268 class GenericIE(InfoExtractor):
1269     """Generic last-resort information extractor."""
1270
1271     _VALID_URL = r'.*'
1272     IE_NAME = u'generic'
1273
    def __init__(self, downloader=None):
        # No extractor-specific state; delegate straight to the base class.
        InfoExtractor.__init__(self, downloader)
1276
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: the generic extractor is a best-effort fallback.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1281
    def report_extraction(self, video_id):
        """Report information extraction."""
        # Status message only; no extraction happens here.
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1285
    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed to new_url."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1289
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request (falling back to GET on 405); if the final
        URL differs from the input, re-dispatches it through the downloader
        and returns True. Returns False when no redirect occurred.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that performs HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): HTTPErrorProcessor is referenced via
        # compat_urllib_error here; verify the compat layer exposes it
        # (stdlib defines it in urllib.request/urllib2).
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain with the resolved URL.
        self._downloader.download([new_url])
        return True
1344
1345     def _real_extract(self, url):
1346         if self._test_redirect(url): return
1347
1348         video_id = url.split('/')[-1]
1349         request = compat_urllib_request.Request(url)
1350         try:
1351             self.report_download_webpage(video_id)
1352             webpage = compat_urllib_request.urlopen(request).read()
1353         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1354             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1355             return
1356         except ValueError as err:
1357             # since this is the last-resort InfoExtractor, if
1358             # this error is thrown, it'll be thrown here
1359             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1360             return
1361
1362         self.report_extraction(video_id)
1363         # Start with something easy: JW Player in SWFObject
1364         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1365         if mobj is None:
1366             # Broaden the search a little bit
1367             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1368         if mobj is None:
1369             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1370             return
1371
1372         # It's possible that one of the regexes
1373         # matched, but returned an empty group:
1374         if mobj.group(1) is None:
1375             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1376             return
1377
1378         video_url = compat_urllib_parse.unquote(mobj.group(1))
1379         video_id = os.path.basename(video_url)
1380
1381         # here's a fun little line of code for you:
1382         video_extension = os.path.splitext(video_id)[1][1:]
1383         video_id = os.path.splitext(video_id)[0]
1384
1385         # it's tempting to parse this further, but you would
1386         # have to take into account all the variations like
1387         #   Video Title - Site Name
1388         #   Site Name | Video Title
1389         #   Video Title - Tagline | Site Name
1390         # and so on and so forth; it's just not practical
1391         mobj = re.search(r'<title>(.*)</title>', webpage)
1392         if mobj is None:
1393             self._downloader.trouble(u'ERROR: unable to extract title')
1394             return
1395         video_title = mobj.group(1)
1396
1397         # video uploader is domain name
1398         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1399         if mobj is None:
1400             self._downloader.trouble(u'ERROR: unable to extract title')
1401             return
1402         video_uploader = mobj.group(1)
1403
1404         return [{
1405             'id':       video_id,
1406             'url':      video_url,
1407             'uploader': video_uploader,
1408             'upload_date':  None,
1409             'title':    video_title,
1410             'ext':      video_extension,
1411         }]
1412
1413
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearch[N|all]:terms)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: search terms may themselves contain
        # ':' (the old split(':') raised "too many values to unpack").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # The GData API omits 'items' entirely when there are no results;
            # indexing it unguarded raised KeyError.
            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems may be smaller than what the caller asked for.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1488
1489
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearch[N|all]:terms)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: search terms may themselves contain
        # ':' (the old split(':') raised "too many values to unpack").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        # Scrape result pages until n ids are collected or no "next" link
        # remains; queue everything collected so far in either case.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1570
1571
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearch[N|all]:terms)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only: search terms may themselves contain
        # ':' (the old split(':') raised "too many values to unpack").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        # Scrape result pages until n ids are collected or no "Next" link
        # remains; queue everything collected so far in either case.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1656
1657
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Scrapes paginated playlist pages and queues each contained video for
    download, honoring the downloader's playliststart/playlistend options.
    """

    # Group 1: the list-type query key ('p', 'a' or 'list'), when present.
    # Group 2: the playlist/artist id (with any PL/EC prefix stripped).
    # Group 3: a single video id, when the URL points inside a playlist.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled as (playlist_access, playlist_prefix, playlist_id, pagenum).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # %s is the playlist id; matches watch links that belong to this list.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Literal text of the pagination link; absence means the last page.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # Everything that is not an artist list is fetched as a plain
            # playlist via view_play_list.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # No pagination link means this was the last page.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # playliststart is 1-based on the command line; playlistend == -1
        # means "until the end of the playlist".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1736
1737
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks a channel's paginated video listing and queues every found video
    for download.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and queue them for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected_ids = []
        page_number = 1

        # Keep fetching listing pages until the pagination link disappears.
        while True:
            self.report_download_page(channel_id, page_number)
            page_url = self._TEMPLATE_URL % (channel_id, page_number)
            try:
                listing = compat_urllib_request.urlopen(
                    compat_urllib_request.Request(page_url)).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's video ids, skipping duplicates within the page.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', listing):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in listing:
                break
            page_number += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected_ids)))

        for video_id in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1788
1789
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's uploads through the GData feed, page by page, and
    queues the selected range of videos for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of the user and queue them for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE results, so
        # request successive windows until a short page signals the end.
        video_ids = []
        page_index = 0

        while True:
            first_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first_index))

            try:
                feed = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's video ids, skipping duplicates within the page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, feed):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; no need to ask
            # for another window.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; playlistend == -1
        # means "until the end of the upload list".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1871
1872
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user page to a numeric users_id, then pages through the
    mobile episode-list endpoint and queues every video for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video paths of the user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Guard against pages without the expected attribute: previously this
        # lookup lived inside the try block, where a missing match raised an
        # uncaught AttributeError on mobj.group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str like every other extractor in this file
                # (was plain str, inconsistent on Python 2).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape BEFORE the duplicate check:
            # the old code compared the raw match but stored the unescaped
            # value, so deduplication was inconsistent for entity-bearing hrefs.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_path = unescapeHTML(mobj.group(1))
                if video_path not in ids_in_page:
                    ids_in_page.append(video_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; playlistend == -1
        # means "until the end of the list".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1963
1964
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        # The trailing path component identifies the file.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Fetch the page as though the 'Free download' button had been pressed.
        post_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
2023
2024
2025 class FacebookIE(InfoExtractor):
2026     """Information Extractor for Facebook"""
2027
2028     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2029     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2030     _NETRC_MACHINE = 'facebook'
2031     IE_NAME = u'facebook'
2032
    def report_login(self):
        """Report attempt to log in."""
        # Progress output is tagged with IE_NAME, i.e. '[facebook]'.
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2036
    def _real_initialize(self):
        """Log in to Facebook before extraction, when credentials are available.

        Credentials come from the downloader's username/password params or,
        failing that, from the 'facebook' machine entry in .netrc. Without
        credentials this is a silent no-op; login problems only emit warnings
        so extraction of public videos can still proceed.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # netrc authenticators() returns (login, account, password).
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are not fatal; continue unauthenticated.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials supplied: skip the login step entirely.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login <form>, the attempt failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2080
2081     def _real_extract(self, url):
2082         mobj = re.match(self._VALID_URL, url)
2083         if mobj is None:
2084             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2085             return
2086         video_id = mobj.group('ID')
2087
2088         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2089         webpage = self._download_webpage(url, video_id)
2090
2091         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2092         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2093         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2094         if not m:
2095             raise ExtractorError(u'Cannot parse data')
2096         data = dict(json.loads(m.group(1)))
2097         params_raw = compat_urllib_parse.unquote(data['params'])
2098         params = json.loads(params_raw)
2099         video_url = params['hd_src']
2100         video_duration = int(params['video_duration'])
2101
2102         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2103         if not m:
2104             raise ExtractorError(u'Cannot find title in webpage')
2105         video_title = unescapeHTML(m.group(1))
2106
2107         info = {
2108             'id': video_id,
2109             'title': video_title,
2110             'url': video_url,
2111             'ext': 'mp4',
2112             'duration': video_duration,
2113             'thumbnail': params['thumbnail_src'],
2114         }
2115         return [info]
2116
2117
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Appends skin=json to the page URL to obtain a JSON description of
    the video.  If the URL turns out to point directly at a media file
    (Content-Type video/*), the already-open handle is handed to the
    downloader instead.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'  # pulls the extension off the media URL
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for a JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON variant to this user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode on a str only works on Python 2;
                # this branch looks py2-only — confirm before relying on it.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh  # reuse the open handle for the download
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try above; only reached on success.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp example format: '08-24-12 03:45PM' — see strptime spec.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2207
2208
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Reconstructs the FLV media URL from the thumbnail's image_src link
    on the watch page, and takes the title from the page <title> tag.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble`, which raised
            # AttributeError; the attribute is `_downloader` everywhere
            # else in this file.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media URL is derived from the thumbnail's image_src link:
        # the directory prefix plus '<video_id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2257
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves the page to an mtvnservices media URI, downloads the show
    index (MRSS) for it, then fetches a configuration XML per item and
    rewrites the chosen RTMP rendition URL into a progressive HTTP URL.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate labels, highest first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate label -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate label -> frame dimensions (shown by --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose regex and must be
        # matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-item configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index (MRSS feed)."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the format table for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the full-episodes
        # index page, then re-match to get the normal groups.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title; an empty 'episode' group means
        # "newest episode", which requires following the redirect below.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The index page redirects to the newest episode; take the
            # final URL and re-derive the episode title from it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Locate the mtvnservices media URI embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Download the MRSS show index for the media URI.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # One <item> per episode part; each needs its own config XML.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent progressive
            # HTTP URL on the llnwd.net mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2451
2452
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the og:video player URL from the page's OpenGraph tags,
    fetches the JS configuration it points at, and takes the media URL
    from its playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull metadata from the meta/OpenGraph tags.
        # NOTE(review): the .group(1) calls assume every tag is present;
        # a missing tag raises AttributeError instead of a clean error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The second playlist entry holds the actual media URL.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2526
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video, then the Adobe
    HDS (f4m) manifest it references, and builds the fragment URL from
    the manifest's media node.
    """

    _WORKING = False  # extractor currently known to be broken
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the metadata and manifest XML.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the metadata XML is unusable.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # Parse the f4m manifest (Adobe HDS namespace).
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the Seg1-Frag1 fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2597
2598
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the watch page for the flv URL, the page title and the
    thumbnail, and returns a single FLV download entry.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL (URL-quoted in the flv_url query parameter)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title (taken from the page <title>, minus the site suffix)
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail (the whole matched URL is used)
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2656
2657
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream definitions."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader name and the slug of the song title both come from the URL.
        uploader, slug_title = match.group(1), match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical URL to the track's JSON description.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions and pick the 128kbps MP3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2730
2731
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    Decodes the base64 media reference embedded in the page into an
    rtmpe URL and scrapes title and description from the page source.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded in the jsclassref attribute.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)
        else:
            video_description = u'No description available.'

        # Derive id and extension from the URL's last path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2785
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the list of candidate URLs for the given format. When the
        format entry is a dict keyed by bitrate, the requested bitrate is
        used (falling back to the highest available one).
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open means the URL is alive.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                continue

        return None

    def _print_formats(self, formats):
        """Print all available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match groups are already text (str) on Python 3; the old
        # .decode('utf-8') calls here crashed with AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        json_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(json_url)
        try:
            self.report_download_json(json_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        if req_format is None or req_format == 'best':
            # Probe each format until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # No candidate URL was reachable; avoid crashing on None below.
            self._downloader.trouble(u'ERROR: unable to extract file url')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2900
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site.

        The URL form decides which: course+video -> one video, course only
        -> playlist of VideoPage references, otherwise -> playlist of all
        CoursePage references from the site root.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Raw string: '\?' in a plain literal is an invalid escape
            # sequence (DeprecationWarning on Python 3.6+).
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recursively extract every referenced video page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Raw string for the same reason as above.
            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recursively extract every referenced course page.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3012
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the highest-quality rendition via MTV's mediaGen XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage already returns decoded text; the old
        # .decode('iso-8859-1') calls on its results crashed on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3092
3093
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into segments and obfuscates the per-segment
    file id; _get_file_ID_mix_string/_get_file_id reimplement the site's
    deobfuscation scheme, so their arithmetic must not be altered.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id (millisecond timestamp
        followed by two random numbers), as expected by the CDN URL."""
        nowTime = int(time.time() * 1000)
        # The ranges mirror the site's own player code — do not "fix" them.
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids.

        A linear-congruential generator driven by ``seed`` repeatedly
        picks (and removes) characters from the source alphabet; the
        resulting permutation is the decoding table.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id using the shuffled
        alphabet derived from ``seed``."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                # Each token is an index into the mixed alphabet.
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and return one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request onto Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One decryption key per segment of the chosen format.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (2 hex digits) into the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3203
3204
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv url, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        def _find(regex, what):
            # Search the page; on a miss, report the error and yield None.
            match = re.search(regex, webpage)
            if match is None:
                self._downloader.trouble(u'ERROR: unable to extract %s' % what)
                return None
            return match.group(1)

        video_url = _find(self.VIDEO_URL_RE, u'video url')
        if video_url is None:
            return
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = _find(self.VIDEO_TITLE_RE, u'video title')
        if video_title is None:
            return

        video_thumbnail = _find(self.VIDEO_THUMB_RE, u'video thumbnail')
        if video_thumbnail is None:
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3267
3268
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video linked from a Google+ post.

        Downloads the post page for metadata, then the photo/video page
        for the actual stream links, picking the highest resolution.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        # (raw string: '\:' in a plain literal is an invalid escape sequence)
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: without this return, mobj.group(1) below raised
            # AttributeError on a failed match.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes from the video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: without this return, links[-1] below raised IndexError.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3392
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The video file lives at a predictable CDN path derived from the id.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, HTML-unescaped,
            # or default when absent.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date', so the upload date
            # was silently dropped by the downloader.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3428
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page; return (raw_item_count, info_dicts).

        On a download or API error the problem is reported and an empty
        page (0, []) is returned. BUGFIX: this used to return None, which
        made the caller's tuple unpacking raise TypeError.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # On failure the API answers with an error object, not a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes for YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single broadcast, or page through a channel archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # lastindex == 1 means only the channel group matched -> archive.
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or empty/errored) page means we reached the end.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3515
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bail out; the original fell through and crashed on
            # None.group('url') below whenever trouble() did not raise.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional; missing is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3552
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # Without the status id we cannot build the media URL; the
            # original continued and crashed on None.group(1).
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Only a warning: the description (and derived title) is optional.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded <a> links from the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # TweetReel has no separate title; reuse the tweet text.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3601         
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is verbose-mode, so the default suitable() would not work.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(page_url, game_id)
        # Movie metadata is embedded in the page's JS; titles appear in
        # parallel <span class="title"> elements, so pair them up in order.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        results = []
        for movie, name in zip(re.finditer(movie_re, webpage), re.finditer(title_re, webpage)):
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % clip_id)
            results.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName'))
            })
        return results
3637
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media file lives on the CDN under a predictable path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title and uploader are scraped straight out of the page markup.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3659
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is serialized as JSON into an inline <script> tag.
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s variant from the stream server.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3694
3695
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats to the console."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of *formats* whose 'format' field equals
        *req_format*, or None when no such entry exists."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title (required)
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional - only warn when missing)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional - only warn when missing)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path segment encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the original tested `result` here — a stale regex
            # match that is always non-None at this point — so an unknown
            # requested format fell through and returned [None].
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3813         
3814
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both id and title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the original message said "unable to extract video
            # title" (copy/paste error) although this branch means the
            # upload date could not be found.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3856
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page first; it only embeds the real player page.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # The flv URL lives inside the embed page's flash setup code.
        webpage = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3902
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is serialized as JSON into the TRAX.Mix constructor.
        m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id keeps the play API happy.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        # Walk the play/next API until it flags the final track.
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3946
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail live on the CDN under predictable paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title comes from the OpenGraph tag, uploader from the bio box.
        title = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3970
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GenericIE must stay last: it is the catch-all fallback.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4020
4021