Add extractors for 3 adult video sites
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
from __future__ import absolute_import

import base64
import datetime
import email.utils
import httplib
import math
import netrc
import os
import random
import re
import socket
import sys
import time
import urllib
import urllib2
import xml.etree.ElementTree
from urlparse import parse_qs, urlparse

from .utils import *
23
24
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; __init__ resets _ready per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing 'IE' suffix.
        return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download url_or_request and return the page decoded as UTF-8.

        Progress is reported through the downloader; on any network error
        an ExtractorError is raised.  The current traceback is forwarded
        (requires the module-level ``import sys``) so the original failure
        point is preserved for debugging.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            urlh = compat_urllib_request.urlopen(url_or_request)
            webpage_bytes = urlh.read()
            return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
125
126
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Accepts watch URLs, embed/e/v URLs, youtu.be short links, anchor
    (#/) redirect URLs and naked video IDs.  Optionally logs in,
    confirms age, and downloads closed captions as .srt subtitles.
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats not listed here default to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string (appears to be height x width, e.g. '22' is
    # the 1280x720 format) used only for the human-readable format label
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class: _VALID_URL requires the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available for the video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration get 4 seconds
            start = float(start)
            end = start + float(dur)
            # hh:mm:ss,mmm timestamps as required by the SRT format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns an (error, srt_contents) pair; exactly one element is
        None.  The error, when present, is a warning string for the
        downloader.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language_code -> track_name map from the track listing
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit 'subtitleslang' param, then
        # English, then whatever language comes first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available formats (itag, extension, dimensions) to stdout."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language to English, then optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID for url, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # group 1 is the whole optional prefix, group 2 is the video ID
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract and return the list of info dictionaries for url."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (backslash-escaped in the page JS)
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' parameter values, since different
        # video classes (embedded, vevo, ...) answer to different ones; stop
        # at the first response that contains a 'token'.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional: only a warning if missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date layouts;
        # once a format matches and the value is rewritten as YYYYMMDD, the
        # remaining strptime calls fail and leave it unchanged.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # this layout didn't match; try the next one
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions (only when --write-srt was requested)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every remaining stream entry carries a
            # 'sig' field; an entry without one raises KeyError here --
            # confirm against current get_video_info output.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dictionary per selected format
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
544
545
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Pages are fetched after confirming the family-filter disclaimer
    (done once in _real_initialize).  Videos whose id starts with
    'yt-' are YouTube embeds and are delegated back to the downloader
    so the YouTube extractor can handle them.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    # The redundant __init__ that only delegated to InfoExtractor.__init__
    # was removed; the inherited constructor is identical.

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract and return the info dictionary list for a metacafe URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, hand the real URL back
        # to the downloader and stop here.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Older page layout: mediaURL (plus optional gdaKey) in the page
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer layout: URL and key live in the flashvars query string
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
671
672
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Validate the URL and pull out the video id: the path component
        # before any '_' suffix or '?' query string.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are still served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The player configuration is embedded as a JS "flashvars" variable.
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the best available quality, from highest to lowest.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        media_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        video_url = compat_urllib_parse.unquote(media_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: try the regular owner markup first, then the markup used
        # for official users; warn (but keep going) if neither is present.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date is displayed as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
759
760
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The video id is the .flv filename carried in the 'current'
        # query parameter.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        video_extension = 'flv'

        # Download the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The direct media URL is the 'file' argument of the video_src link.
        media_match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(media_match.group(1))

        # Title and uploader are both embedded in the <title> element.
        title_match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = title_match.group(1).decode('utf-8')
        video_uploader = title_match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
824
825
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-'/watch/' URLs are first rewritten to the canonical English
    '/watch/<vid>/<id>' form (one level of recursion), then the watch page
    and the cosmos.bcst.yahoo.com playlist service are scraped for the
    actual media URL.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 is only the 'people'/'profile' path component; the
        # uploader's name is captured by group 2 (previously group(1) was
        # used, which returned the literal word 'people' or 'profile').
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist service)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
967
968
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # The dot after (www|player) is escaped so that only a real subdomain
    # separator matches, not an arbitrary character.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page.  Only the
        # failures this parsing can produce are caught: a missing separator
        # raises IndexError, malformed JSON raises ValueError.  The previous
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1083
1084
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # French/German video pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex (with regexFlags) against the page.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored under key in the returned dict.
        Returns None (after reporting) when the page does not match or any
        required group is missing.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed it returned None, and
        # re.search(regex, None, ...) raises TypeError here rather than
        # producing the ERROR message below — confirm whether intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # err is the caller-supplied error message for this group.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live-stream page."""
        # Language code is a fixed path component of live URLs.
        video_lang = url.split('/')[-4]
        # First hop: locate the videothek JS that carries the stream info.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull the rtmp path, SWF player and base URL for the
        # requested language out of the JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned or used, so
        # live streams currently yield no download information (see
        # _real_extract, which discards this call's result) — confirm
        # whether live-stream support is intentionally unimplemented.

    def extractPlus7Stream(self, url):
        """Resolve a regular (ARTE+7 catch-up) video via its XML chain."""
        # Language code is a fixed path component of plus7 URLs.
        video_lang = url.split('/')[-3]
        # First hop: the flash player param points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: the videoref XML lists one <video> per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Third hop: the per-language XML carries id, title, date and the
        # HD media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams take the (currently result-less) live path; everything
        # else goes through the ARTE+7 XML chain.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1219
1220
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener style redirects, then scrapes the page for a
    JW-Player/SWFObject style 'file='/'source=' media URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        # Restart the extraction chain with the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # This failure concerns the uploader (domain), not the title;
            # the message previously said 'unable to extract title'.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1365
1366
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries ("ytsearchN:query")."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the "ytsearch" prefix and queue up to N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the query text itself may
        # contain ':' and an unbounded split would fail to unpack.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try block minimal so a ValueError raised inside
            # _download_n_results is not mistaken for a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves at most 50 results per page.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the real total; never request beyond it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1441
1442
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries ("gvsearchN:query")."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the "gvsearch" prefix and queue up to N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the query text itself may
        # contain ':' and an unbounded split would fail to unpack.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try block minimal so a ValueError raised inside
            # _download_n_results is not mistaken for a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, deduplicated in first-seen order
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)

            # Stop once n ids are collected or there is no next page, then
            # queue the collected ids (single exit path instead of the
            # previous duplicated download loops).
            if len(video_ids) >= n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids[:n]:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1523
1524
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries ("yvsearchN:query")."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the "yvsearch" prefix and queue up to N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the query text itself may
        # contain ':' and an unbounded split would fail to unpack.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the leading "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try block minimal so a ValueError raised inside
            # _download_n_results is not mistaken for a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, deduplicated in first-seen order
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)

            # Stop once n ids are collected or there is no next page, then
            # queue the collected ids (single exit path instead of the
            # previous duplicated download loops).
            if len(video_ids) >= n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids[:n]:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1609
1610
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Walk the playlist pages, collect video ids and queue each one."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A third capture group means the URL addresses a single video.
        single_video = mobj.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default prefix for playlists; artist lists ('a')
        # use a different access point.
        if mobj.group(1) == 'a':
            playlist_prefix, playlist_access = 'a', 'artist'
        else:
            playlist_prefix, playlist_access = 'p', 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1
        id_pattern = self._VIDEO_INDICATOR_TEMPLATE % playlist_id

        while True:
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, deduplicated within the page only.
            page_ids = []
            for id_match in re.finditer(id_pattern, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        stop = None if playlistend == -1 else playlistend
        video_ids = video_ids[playliststart:stop]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1689
1690
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the channel's paginated video list and queue every video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, deduplicated within the page only.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1741
1742
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Fetch the user's uploads via the GData API and queue each video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # request successive pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids, deduplicated within the page only.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; skip the
            # extra round trip that an empty page would otherwise cost.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        stop = None if playlistend == -1 else playlistend
        video_ids = video_ids[playliststart:stop]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1824
1825
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """List a user's videos through blip.tv's AJAX endpoint and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # A page without a users id previously crashed with an
        # AttributeError (mobj.group on None); report it cleanly instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other handlers here.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers.  Unescape before the membership
            # test so the dedup check compares like with like (previously
            # the raw id was compared against unescaped stored ids).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user's --playlist-start/--playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1916
1917
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles link to its direct download URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse runs of whitespace in the site's error text.
                # Raw string: '\s' was previously a non-raw regex escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1976
1977
1978 class FacebookIE(InfoExtractor):
1979     """Information Extractor for Facebook"""
1980
1981     _WORKING = False
1982     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1983     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1984     _NETRC_MACHINE = 'facebook'
1985     _available_formats = ['video', 'highqual', 'lowqual']
1986     _video_extensions = {
1987         'video': 'mp4',
1988         'highqual': 'mp4',
1989         'lowqual': 'mp4',
1990     }
1991     IE_NAME = u'facebook'
1992
    def __init__(self, downloader=None):
        # Delegate to the InfoExtractor base, which stores the downloader.
        InfoExtractor.__init__(self, downloader)
1995
    def _reporter(self, message):
        """Add header and report message.

        Prefixes *message* with the '[facebook]' tag so all of this
        extractor's screen output is uniformly labelled.
        """
        self._downloader.to_screen(u'[facebook] %s' % message)
1999
    def report_login(self):
        """Report attempt to log in."""
        # Routed through _reporter so the message carries the '[facebook]' tag.
        self._reporter(u'Logging in')
2003
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        # Routed through _reporter so the message carries the '[facebook]' tag.
        self._reporter(u'%s: Downloading video webpage' % video_id)
2007
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        # Routed through _reporter so the message carries the '[facebook]' tag.
        self._reporter(u'%s: Extracting video information' % video_id)
2011
2012     def _parse_page(self, video_webpage):
2013         """Extract video information from page"""
2014         # General data
2015         data = {'title': r'\("video_title", "(.*?)"\)',
2016             'description': r'<div class="datawrap">(.*?)</div>',
2017             'owner': r'\("video_owner_name", "(.*?)"\)',
2018             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2019             }
2020         video_info = {}
2021         for piece in data.keys():
2022             mobj = re.search(data[piece], video_webpage)
2023             if mobj is not None:
2024                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2025
2026         # Video urls
2027         video_urls = {}
2028         for fmt in self._available_formats:
2029             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2030             if mobj is not None:
2031                 # URL is in a Javascript segment inside an escaped Unicode format within
2032                 # the generally utf-8 page
2033                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2034         video_info['video_urls'] = video_urls
2035
2036         return video_info
2037
    def _real_initialize(self):
        """Log in to Facebook before extraction when credentials are available.

        Credentials come from --username/--password or, with --netrc, from
        the user's .netrc file.  Without credentials this is a no-op.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a bad .netrc only warns, it does not abort.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form is echoed back on failure (bad credentials or
            # rate limiting); its absence is treated as a successful login.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2081
2082     def _real_extract(self, url):
2083         mobj = re.match(self._VALID_URL, url)
2084         if mobj is None:
2085             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2086             return
2087         video_id = mobj.group('ID')
2088
2089         # Get video webpage
2090         self.report_video_webpage_download(video_id)
2091         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2092         try:
2093             page = compat_urllib_request.urlopen(request)
2094             video_webpage = page.read()
2095         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2096             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2097             return
2098
2099         # Start extracting information
2100         self.report_information_extraction(video_id)
2101
2102         # Extract information
2103         video_info = self._parse_page(video_webpage)
2104
2105         # uploader
2106         if 'owner' not in video_info:
2107             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2108             return
2109         video_uploader = video_info['owner']
2110
2111         # title
2112         if 'title' not in video_info:
2113             self._downloader.trouble(u'ERROR: unable to extract video title')
2114             return
2115         video_title = video_info['title']
2116         video_title = video_title.decode('utf-8')
2117
2118         # thumbnail image
2119         if 'thumbnail' not in video_info:
2120             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2121             video_thumbnail = ''
2122         else:
2123             video_thumbnail = video_info['thumbnail']
2124
2125         # upload date
2126         upload_date = None
2127         if 'upload_date' in video_info:
2128             upload_time = video_info['upload_date']
2129             timetuple = email.utils.parsedate_tz(upload_time)
2130             if timetuple is not None:
2131                 try:
2132                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2133                 except:
2134                     pass
2135
2136         # description
2137         video_description = video_info.get('description', 'No description available.')
2138
2139         url_map = video_info['video_urls']
2140         if url_map:
2141             # Decide which formats to download
2142             req_format = self._downloader.params.get('format', None)
2143             format_limit = self._downloader.params.get('format_limit', None)
2144
2145             if format_limit is not None and format_limit in self._available_formats:
2146                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2147             else:
2148                 format_list = self._available_formats
2149             existing_formats = [x for x in format_list if x in url_map]
2150             if len(existing_formats) == 0:
2151                 self._downloader.trouble(u'ERROR: no known formats available for video')
2152                 return
2153             if req_format is None:
2154                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2155             elif req_format == 'worst':
2156                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2157             elif req_format == '-1':
2158                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2159             else:
2160                 # Specific format
2161                 if req_format not in url_map:
2162                     self._downloader.trouble(u'ERROR: requested format not available')
2163                     return
2164                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2165
2166         results = []
2167         for format_param, video_real_url in video_url_list:
2168             # Extension
2169             video_extension = self._video_extensions.get(format_param, 'mp4')
2170
2171             results.append({
2172                 'id':       video_id.decode('utf-8'),
2173                 'url':      video_real_url.decode('utf-8'),
2174                 'uploader': video_uploader.decode('utf-8'),
2175                 'upload_date':  upload_date,
2176                 'title':    video_title,
2177                 'ext':      video_extension.decode('utf-8'),
2178                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2179                 'thumbnail':    video_thumbnail.decode('utf-8'),
2180                 'description':  video_description.decode('utf-8'),
2181             })
2182         return results
2183
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Accepts any blip.tv path; group(1) is the path part (used for reporting).
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Requests the page with skin=json to get machine-readable metadata;
        if the server instead answers with a video/* Content-Type, the URL
        is treated as a direct media download.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, respecting any existing query string.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # No metadata available: derive id/title/ext from the filename.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # response body is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # Reuse the handle opened above; the body is the JSON metadata.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Metadata is either wrapped in a 'Post' object or top-level.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): presumably blip.tv serves the media only to an
        # iTunes-like User-Agent -- confirm; this mutates the global headers.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2273
2274
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a single-element list of info dicts, or None after
        reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was `self._download.trouble`, which raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server base URL; the flv
        # lives next to the thumbnails directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2323
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, best first; used both for selection and listing.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display-only resolution table for --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one or more videos from a Daily Show / Colbert URL.

        Resolves shortnames to the full-episodes page, follows the
        redirect to a concrete episode when needed, then walks the MRSS
        index and per-item configuration XML to pick a rendition.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortname form (":tds" etc.): rewrite to the show's
        # full-episodes page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Bare full-episodes URL means "newest episode": follow the
            # server redirect below to find out which one that is.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # geturl() reflects any redirect; re-match to get the episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the Flash player URL(s) embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Resolve redirects to get the canonical player URL.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # The mgid URI keys the MRSS feed listing all items of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # The guid is a colon-separated mgid; last part is the media id,
            # second-to-last names the show site.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> pairs a bitrate with an rtmp source URL.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2534
2535
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract a video from an Escapist magazine page.

        Reads description/thumbnail/player URL from <meta> tags, then
        fetches the player configuration (JavaScript posing as JSON) to
        locate the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # BUGFIX: each search below used to call .group() unconditionally,
        # so a missing <meta> tag crashed with AttributeError instead of
        # reporting trouble.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the config URL as a query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2609
2610
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): _WORKING=False appears to mark this extractor as
    # currently broken/disabled -- confirm against the framework's handling.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a video via the moogaloop metadata XML and f4m manifest.

        Two round trips: the metadata XML yields title/description and a
        manifest URL; the manifest yields the media node used to build
        the final fragment URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # findall()[0] raises IndexError when a required tag is missing.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest is namespaced; media/@url and <id> build the
        # final fragment URL below.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2681
2682
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull flv URL, title and thumbnail out of an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside a flashvars-style parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, up to the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # The whole matched image URL serves as the thumbnail.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2740
2741
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track URL is being resolved through the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definition is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    @staticmethod
    def _format_upload_date(created_at):
        """Reduce the API 'created_at' value (e.g. '2012/04/06 12:00:00 +0000')
        to the YYYYMMDD form documented for upload_date at the top of this
        module. Returns the input unchanged when no date is recognised."""
        mobj = re.search(r'(\d{4})[/-](\d{2})[/-](\d{2})', compat_str(created_at))
        if mobj is None:
            return created_at
        return u''.join(mobj.groups())

    def _real_extract(self, url):
        """Resolve a track URL via the public API and return its mp3 stream info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint turns the human URL into the track's JSON record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Second request: the per-track stream catalogue, keyed by format name.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            # Normalised to YYYYMMDD; the raw 'created_at' string was returned before,
            # breaking the upload_date contract documented at the top of this module.
            'upload_date':  self._format_upload_date(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2814
2815
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL, title and description of an InfoQ talk."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real stream id is base64-encoded in the page.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional — fall back to a placeholder).
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: stream ids may contain additional dots,
        # which made the original split('.') raise ValueError on unpacking.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2869
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Resolves a cloudcast page to its downloadable audio files via the
    site's v1 JSON API. Disabled (_WORKING = False) pending a port to
    the newer API.

    NOTE(review): the .decode('utf-8') calls on str objects throughout
    this class are Python-2-only; they raise AttributeError/TypeError on
    Python 3 strings — likely part of why the extractor is disabled.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict {bitrate: [urls]} or, for formats
        without bitrate variants, a plain url list (the TypeError path).
        'best' (or an unknown bitrate) selects the highest available one.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list.

        Probes each candidate with an HTTP request and returns the first
        one that answers; None when none of them do.
        """
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # Human-readable listing for --list-formats; mirrors the dict/list
        # duality handled in get_urls (TypeError branch = no bitrate info).
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request; the last two path components of the page
        # URL ("<uploader>/<cloudcast>") address the JSON record.
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format: take the first format with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): if no URL in any format is reachable, file_url is
        # None here and the .decode below raises — no graceful error path.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2984
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three kinds of URLs: a specific video page (course + video),
    a course page (expanded into a playlist of its video pages) and the
    site root (expanded into a playlist of all course pages).
    """

    # Dots in the host name are now escaped; the unescaped originals
    # matched any character there.
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and relative file path live in the per-video XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Raw strings and an escaped dot; the old patterns relied on
            # a literal '\?' surviving inside a non-raw string.
            links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recursively extract every referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            # Recursively extract every referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3101
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        Reads the mtv_vt/mtv_an/mtvn_uri meta tags and the default
        playlist id from the video page, then queries the mediaGen
        service for the available renditions.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed error message: it previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard: an empty rendition list previously raised IndexError below.
            self._downloader.trouble(u'ERROR: unable to extract media renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # find() returns None when <src> is missing, hence AttributeError
            # is now caught alongside the missing-attribute KeyError.
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3181
3182
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as numbered flv/mp4 segments; the segment
    file ids must be de-scrambled with a per-video 'seed' before the
    download URLs can be built.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components,
        # mimicking what the site's own player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by the video's seed;
        # a linear-congruential step picks (and removes) one character per
        # round. The exact constants and float division must not change —
        # the server performs the same computation.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # fileId is a '*'-separated list of indices into the shuffled
        # alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto Youku's format names:
            # 'best' prefers hd2 when present, 'worst' takes mp4, anything
            # else falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (two hex digits) into the fileid
            # and pair it with that segment's key 'k' from the config.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3292
3293
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce the webpage download for video_id."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce the start of information extraction for video_id."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body.
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Each field is mandatory; bail out the moment one is missing.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if not match:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if not match:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if not match:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3356
3357
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the best-resolution video from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bug fix: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bug fix: previously continued and raised IndexError on links[-1].
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3481
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com / watch.nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) capture group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'uploader': None,  # the page exposes no uploader; key is mandatory per module contract
            # Fixed key name: it was misspelled 'uploader_date', so the
            # date never reached consumers of this dict.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3517
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one JSON page of clips; return (raw item count, parsed infos)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: returning None here made the caller's tuple unpacking
            # (`page_count, page_info = self._parse_page(...)`) crash with a
            # TypeError; report the error and yield an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a downloadable file are skipped
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with an ISO date; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: page through the whole archive
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or failed) page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3596
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The downloadable file is the second <source> inside the <video> tag
        source_m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_m:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(source_m.group('url'))

        # Title comes from the player page heading
        title_m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not title_m:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(title_m.group('title'))

        # Description is optional
        desc_m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_m.group('desc')) if desc_m else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3633
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The twitter status id names the .mov file on the CDN
        status_m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not status_m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = status_m.group(1)

        # The tweet text doubles as title and description; embedded links are dropped
        text_m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not text_m:
            self._downloader.trouble(u'WARNING: Cannot find description')
        desc = unescapeHTML(re.sub('<a.*?</a>', '', text_m.group(1))).strip()

        uploader_m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not uploader_m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(uploader_m.group('uploader'))
        uploader_id = unescapeHTML(uploader_m.group('uploader_id'))

        # Upload time is given as a unix timestamp attribute
        time_m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not time_m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(time_m.group(1))).strftime('%Y%m%d')

        return [{
            'id': video_id,
            'url': 'http://files.tweetreel.com/video/' + status_id + '.mov',
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }]
3682         
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # cannot be used here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        video_page_url = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(video_page_url, game_id)

        # Movie entries live in a JS object; titles appear in matching order
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'

        results = []
        for movie, name in zip(re.finditer(movie_re, webpage), re.finditer(title_re, webpage)):
            trailer_id = movie.group('videoID')
            trailer_url = movie.group('videoURL')
            if not trailer_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % trailer_id)
            results.append({
                'id': trailer_id,
                'url': trailer_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName'))
            })
        return results
3718         
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploader channel id are exposed as data attributes
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            # The flv is served from the CDN under the recording id
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3740
3741
3742
3743 class YouPornIE(InfoExtractor):
3744     """Information extractor for youporn.com."""
3745
3746     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3747     IE_NAME = u'youporn'
3748     VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>'
3749     VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>'
3750     VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>'
3751     DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3752     LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3753
3754     def __init__(self, downloader=None):
3755         InfoExtractor.__init__(self, downloader)
3756
3757     def report_id(self, video_id):
3758         """Report finding video ID"""
3759         self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id)
3760
3761     def report_webpage(self, url):
3762         """Report downloading page"""
3763         self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url)
3764
3765     def report_title(self, video_title):
3766         """Report dfinding title"""
3767         self._downloader.to_screen(u'[youporn] Title: %s' % video_title)
3768     
3769     def report_uploader(self, uploader):
3770         """Report dfinding title"""
3771         self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader)
3772
3773     def report_upload_date(self, video_date):
3774         """Report finding date"""
3775         self._downloader.to_screen(u'[youporn] Date: %s' % video_date)
3776
3777     def _print_formats(self, formats):
3778         """Print all available formats"""
3779         print 'Available formats:'
3780         print u'ext\t\tformat'
3781         print u'---------------------------------'
3782         for format in formats:
3783             print u'%s\t\t%s'  % (format['ext'], format['format'])
3784
3785     def _specific(self, req_format, formats):
3786         for x in formats:
3787             if(x["format"]==req_format):
3788                 return x
3789         return None
3790
3791
3792     def _real_extract(self, url):
3793         mobj = re.match(self._VALID_URL, url)
3794         if mobj is None:
3795             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3796             return
3797
3798         video_id = mobj.group('videoid').decode('utf-8')
3799         self.report_id(video_id)
3800
3801         # Get webpage content
3802         try:
3803             webpage = urllib2.urlopen(url).read()
3804         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3805             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3806             return
3807         self.report_webpage(url)
3808
3809         # Get the video title
3810         result = re.search(self.VIDEO_TITLE_RE, webpage)
3811         if result is None:
3812             self._downloader.trouble(u'ERROR: unable to extract video title')
3813             return
3814         video_title = result.group('title').decode('utf-8').strip()
3815         self.report_title(video_title)
3816
3817         # Get the video date
3818         result = re.search(self.VIDEO_DATE_RE, webpage)
3819         if result is None:
3820             self._downloader.trouble(u'ERROR: unable to extract video date')
3821             return
3822         upload_date = result.group('date').decode('utf-8').strip()
3823         self.report_upload_date(upload_date)
3824
3825         # Get the video uploader
3826         result = re.search(self.VIDEO_UPLOADER_RE, webpage)
3827         if result is None:
3828             self._downloader.trouble(u'ERROR: unable to extract uploader')
3829             return
3830         video_uploader = result.group('uploader').decode('utf-8').strip()
3831         video_uploader = clean_html( video_uploader )
3832         self.report_uploader(video_uploader)
3833
3834         # Get all of the formats available
3835         result = re.search(self.DOWNLOAD_LIST_RE, webpage)
3836         if result is None:
3837             self._downloader.trouble(u'ERROR: unable to extract download list')
3838             return
3839         download_list_html = result.group('download_list').decode('utf-8').strip()
3840
3841         # Get all of the links from the page
3842         links = re.findall(self.LINK_RE, download_list_html)
3843         if(len(links) == 0):
3844             self._downloader.trouble(u'ERROR: no known formats available for video')
3845             return
3846         
3847         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))   
3848
3849         formats = []
3850         for link in links:
3851
3852             # A link looks like this:
3853             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3854             # A path looks like this:
3855             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3856             video_url = unescapeHTML( link.decode('utf-8') )
3857             path = urlparse( video_url ).path
3858             extension = os.path.splitext( path )[1][1:]
3859             format = path.split('/')[4].split('_')[:2]
3860             size = format[0]
3861             bitrate = format[1]
3862             format = "-".join( format )
3863             title = u'%s-%s-%s' % (video_title, size, bitrate)
3864
3865             formats.append({
3866                 'id': video_id,
3867                 'url': video_url,
3868                 'uploader': video_uploader,
3869                 'upload_date': upload_date,
3870                 'title': title,
3871                 'ext': extension,
3872                 'format': format,
3873                 'thumbnail': None,
3874                 'description': None,
3875                 'player_url': None
3876             })
3877
3878         if self._downloader.params.get('listformats', None):
3879             self._print_formats(formats)
3880             return
3881
3882         req_format = self._downloader.params.get('format', None)
3883         #format_limit = self._downloader.params.get('format_limit', None)
3884         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3885
3886
3887         if req_format is None or req_format == 'best':
3888             return [formats[0]]
3889         elif req_format == 'worst':
3890             return [formats[-1]]
3891         elif req_format in ('-1', 'all'):
3892             return formats
3893         else:
3894             format = self._specific( req_format, formats )
3895             if result is None:
3896                 self._downloader.trouble(u'ERROR: requested format not available')
3897                 return
3898             return [format]
3899
3900         
3901
3902 class PornotubeIE(InfoExtractor):
3903     """Information extractor for pornotube.com."""
3904
3905     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3906     IE_NAME = u'pornotube'
3907     VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3908     VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3909
3910
3911     def __init__(self, downloader=None):
3912         InfoExtractor.__init__(self, downloader)
3913
3914     def report_extract_entry(self, url):
3915         """Report downloading extry"""
3916         self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url.decode('utf-8'))
3917
3918     def report_date(self, upload_date):
3919         """Report finding uploaded date"""
3920         self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date)
3921
3922     def report_webpage(self, url):
3923         """Report downloading page"""
3924         self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url)
3925
3926     def report_title(self, video_title):
3927         """Report downloading extry"""
3928         self._downloader.to_screen(u'[pornotube] Title: %s' % video_title.decode('utf-8'))
3929
3930     def _real_extract(self, url):
3931         mobj = re.match(self._VALID_URL, url)
3932         if mobj is None:
3933             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3934             return
3935
3936         video_id = mobj.group('videoid').decode('utf-8')
3937         video_title = mobj.group('title').decode('utf-8')
3938         self.report_title(video_title);
3939
3940         # Get webpage content
3941         try:
3942             webpage = urllib2.urlopen(url).read()
3943         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3944             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3945             return
3946         self.report_webpage(url)
3947
3948         # Get the video URL
3949         result = re.search(self.VIDEO_URL_RE, webpage)
3950         if result is None:
3951             self._downloader.trouble(u'ERROR: unable to extract video url')
3952             return
3953         video_url = urllib.unquote(result.group('url').decode('utf-8'))
3954         self.report_extract_entry(video_url)
3955
3956         #Get the uploaded date
3957         result = re.search(self.VIDEO_UPLOADED_RE, webpage)
3958         if result is None:
3959             self._downloader.trouble(u'ERROR: unable to extract video title')
3960             return
3961         upload_date = result.group('date').decode('utf-8')
3962         self.report_date(upload_date);
3963
3964
3965         info = {'id': video_id,
3966                 'url': video_url,
3967                 'uploader': None,
3968                 'upload_date': upload_date,
3969                 'title': video_title,
3970                 'ext': 'flv',
3971                 'format': 'flv',
3972                 'thumbnail': None,
3973                 'description': None,
3974                 'player_url': None}
3975
3976         return [info]
3977
3978
3979
3980 class YouJizzIE(InfoExtractor):
3981     """Information extractor for youjizz.com."""
3982
3983     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/([^.]+).html$'
3984     IE_NAME = u'youjizz'
3985     VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
3986     EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
3987     SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'
3988
3989     def __init__(self, downloader=None):
3990         InfoExtractor.__init__(self, downloader)
3991
3992     def report_extract_entry(self, url):
3993         """Report downloading extry"""
3994         self._downloader.to_screen(u'[youjizz] Downloading entry: %s' % url.decode('utf-8'))
3995
3996     def report_webpage(self, url):
3997         """Report downloading page"""
3998         self._downloader.to_screen(u'[youjizz] Downloaded page: %s' % url)
3999
4000     def report_title(self, video_title):
4001         """Report downloading extry"""
4002         self._downloader.to_screen(u'[youjizz] Title: %s' % video_title.decode('utf-8'))
4003
4004     def report_embed_page(self, embed_page):
4005         """Report downloading extry"""
4006         self._downloader.to_screen(u'[youjizz] Embed Page: %s' % embed_page.decode('utf-8'))
4007
4008     def _real_extract(self, url):
4009         # Get webpage content
4010         try:
4011             webpage = urllib2.urlopen(url).read()
4012         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4013             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
4014             return
4015         self.report_webpage(url)
4016
4017         # Get the video title
4018         result = re.search(self.VIDEO_TITLE_RE, webpage)
4019         if result is None:
4020             self._downloader.trouble(u'ERROR: unable to extract video title')
4021             return
4022         video_title = result.group('title').decode('utf-8').strip()
4023         self.report_title(video_title)
4024
4025         # Get the embed page
4026         result = re.search(self.EMBED_PAGE_RE, webpage)
4027         if result is None:
4028             self._downloader.trouble(u'ERROR: unable to extract embed page')
4029             return
4030
4031         embed_page_url = result.group(0).decode('utf-8').strip()
4032         video_id = result.group('videoid').decode('utf-8')
4033         self.report_embed_page(embed_page_url)
4034     
4035         try:
4036             webpage = urllib2.urlopen(embed_page_url).read()
4037         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4038             self._downloader.trouble(u'ERROR: unable to download video embed page: %s' % err)
4039             return
4040         
4041         # Get the video URL
4042         result = re.search(self.SOURCE_RE, webpage)
4043         if result is None:
4044             self._downloader.trouble(u'ERROR: unable to extract video url')
4045             return
4046         video_url = result.group('source').decode('utf-8')
4047         self.report_extract_entry(video_url)
4048
4049         info = {'id': video_id,
4050                 'url': video_url,
4051                 'uploader': None,
4052                 'upload_date': None,
4053                 'title': video_title,
4054                 'ext': 'flv',
4055                 'format': 'flv',
4056                 'thumbnail': None,
4057                 'description': None,
4058                 'player_url': embed_page_url}
4059
4060         return [info]
4061
4062
def gen_extractors():
    """Return an instance of every supported extractor.

    The order does matter; the first extractor matched is the one
    handling the URL.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4109
4110