_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import datetime
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import email.utils
  13 import xml.etree.ElementTree
  14 import random
  15 import math
  16
  17 from .utils import *
  18
  19
  20 class InfoExtractor(object):
  21     """Information Extractor class.
  22
  23     Information extractors are the classes that, given a URL, extract
  24     information about the video (or videos) the URL refers to. This
  25     information includes the real video URL, the video title, author and
  26     others. The information is stored in a dictionary which is then
  27     passed to the FileDownloader. The FileDownloader processes this
  28     information possibly downloading the video to the file system, among
  29     other possible outcomes.
  30
  31     The dictionaries must include the following fields:
  32
  33     id:             Video identifier.
  34     url:            Final video URL.
  35     title:          Video title, unescaped.
  36     ext:            Video filename extension.
  37     uploader:       Full name of the video uploader.
  38     upload_date:    Video upload date (YYYYMMDD).
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader_id:    Nickname or id of the video uploader.
  46     player_url:     SWF Player URL (used for rtmpdump).
  47     subtitles:      The .srt file contents.
  48     urlhandle:      [internal] The urlHandle to be used to download the file,
  49                     like returned by urllib.request.urlopen
  50
  51     The fields should all be Unicode strings.
  52
  53     Subclasses of this one should re-define the _real_initialize() and
  54     _real_extract() methods and define a _VALID_URL regexp.
  55     Probably, they should also be added to the list of extractors.
  56
  57     _real_extract() must return a *list* of information dictionaries as
  58     described above.
  59
  60     Finally, the _WORKING attribute should be set to False for broken IEs
  61     in order to warn the users and skip the tests.
  62     """
  63
  64     _ready = False
  65     _downloader = None
  66     _WORKING = True
  67
  68     def __init__(self, downloader=None):
  69         """Constructor. Receives an optional downloader."""
  70         self._ready = False
  71         self.set_downloader(downloader)
  72
  73     def suitable(self, url):
  74         """Receives a URL and returns True if suitable for this IE."""
  75         return re.match(self._VALID_URL, url) is not None
  76
  77     def working(self):
  78         """Getter method for _WORKING."""
  79         return self._WORKING
  80
  81     def initialize(self):
  82         """Initializes an instance (authentication, etc)."""
  83         if not self._ready:
  84             self._real_initialize()
  85             self._ready = True
  86
  87     def extract(self, url):
  88         """Extracts URL information and returns it in list of dicts."""
  89         self.initialize()
  90         return self._real_extract(url)
  91
  92     def set_downloader(self, downloader):
  93         """Sets the downloader for this IE."""
  94         self._downloader = downloader
  95
  96     def _real_initialize(self):
  97         """Real initialization process. Redefine in subclasses."""
  98         pass
  99
 100     def _real_extract(self, url):
 101         """Real extraction process. Redefine in subclasses."""
 102         pass
 103
 104     @property
 105     def IE_NAME(self):
 106         return type(self).__name__[:-2]
 107
 108     def _download_webpage(self, url, video_id, note=None, errnote=None):
 109         if note is None:
 110             note = u'Downloading video webpage'
 111         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 112         try:
 113             urlh = compat_urllib_request.urlopen(url)
 114             webpage_bytes = urlh.read()
 115             return webpage_bytes.decode('utf-8', 'replace')
 116         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 117             if errnote is None:
 118                 errnote = u'Unable to download webpage'
 119             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
 120
 121
 122 class YoutubeIE(InfoExtractor):
 123     """Information extractor for youtube.com."""
 124
 125     _VALID_URL = r"""^
 126                      (
 127                          (?:https?://)?                                       # http(s):// (optional)
 128                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 129                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 130                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 131                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 132                          (?:                                                  # the various things that can precede the ID:
 133                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 134                              |(?:                                             # or the v= param in all its forms
 135                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 136                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 137                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 138                                  v=
 139                              )
 140                          )?                                                   # optional -> youtube.com/xxxx is OK
 141                      )?                                                       # all until now is optional -> you can pass the naked ID
 142                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 143                      (?(1).+)?                                                # if we found the ID, everything can follow
 144                      $"""
 145     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 146     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 147     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 148     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 149     _NETRC_MACHINE = 'youtube'
 150     # Listed in order of quality
 151     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 152     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 153     _video_extensions = {
 154         '13': '3gp',
 155         '17': 'mp4',
 156         '18': 'mp4',
 157         '22': 'mp4',
 158         '37': 'mp4',
 159         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 160         '43': 'webm',
 161         '44': 'webm',
 162         '45': 'webm',
 163         '46': 'webm',
 164     }
 165     _video_dimensions = {
 166         '5': '240x400',
 167         '6': '???',
 168         '13': '???',
 169         '17': '144x176',
 170         '18': '360x640',
 171         '22': '720x1280',
 172         '34': '360x640',
 173         '35': '480x854',
 174         '37': '1080x1920',
 175         '38': '3072x4096',
 176         '43': '360x640',
 177         '44': '480x854',
 178         '45': '720x1280',
 179         '46': '1080x1920',
 180     }
 181     IE_NAME = u'youtube'
 182
 183     def suitable(self, url):
 184         """Receives a URL and returns True if suitable for this IE."""
 185         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 186
 187     def report_lang(self):
 188         """Report attempt to set language."""
 189         self._downloader.to_screen(u'[youtube] Setting language')
 190
 191     def report_login(self):
 192         """Report attempt to log in."""
 193         self._downloader.to_screen(u'[youtube] Logging in')
 194
 195     def report_age_confirmation(self):
 196         """Report attempt to confirm age."""
 197         self._downloader.to_screen(u'[youtube] Confirming age')
 198
 199     def report_video_webpage_download(self, video_id):
 200         """Report attempt to download video webpage."""
 201         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 202
 203     def report_video_info_webpage_download(self, video_id):
 204         """Report attempt to download video info webpage."""
 205         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 206
 207     def report_video_subtitles_download(self, video_id):
 208         """Report attempt to download video info webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 210
 211     def report_information_extraction(self, video_id):
 212         """Report attempt to extract video information."""
 213         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 214
 215     def report_unavailable_format(self, video_id, format):
 216         """Report extracted video URL."""
 217         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 218
 219     def report_rtmp_download(self):
 220         """Indicate the download will use the RTMP protocol."""
 221         self._downloader.to_screen(u'[youtube] RTMP download detected')
 222
 223     def _closed_captions_xml_to_srt(self, xml_string):
 224         srt = ''
 225         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 226         # TODO parse xml instead of regex
 227         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 228             if not dur: dur = '4'
 229             start = float(start)
 230             end = start + float(dur)
 231             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 232             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 233             caption = unescapeHTML(caption)
 234             caption = unescapeHTML(caption) # double cycle, intentional
 235             srt += str(n+1) + '\n'
 236             srt += start + ' --> ' + end + '\n'
 237             srt += caption + '\n\n'
 238         return srt
 239
 240     def _extract_subtitles(self, video_id):
 241         self.report_video_subtitles_download(video_id)
 242         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 243         try:
 244             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 245         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 246             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 247         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 248         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 249         if not srt_lang_list:
 250             return (u'WARNING: video has no closed captions', None)
 251         if self._downloader.params.get('subtitleslang', False):
 252             srt_lang = self._downloader.params.get('subtitleslang')
 253         elif 'en' in srt_lang_list:
 254             srt_lang = 'en'
 255         else:
 256             srt_lang = list(srt_lang_list.keys())[0]
 257         if not srt_lang in srt_lang_list:
 258             return (u'WARNING: no closed captions found in the specified language', None)
 259         request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 260         try:
 261             srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
 262         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 263             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 264         if not srt_xml:
 265             return (u'WARNING: unable to download video subtitles', None)
 266         return (None, self._closed_captions_xml_to_srt(srt_xml))
 267
 268     def _print_formats(self, formats):
 269         print('Available formats:')
 270         for x in formats:
 271             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 272
 273     def _real_initialize(self):
 274         if self._downloader is None:
 275             return
 276
 277         username = None
 278         password = None
 279         downloader_params = self._downloader.params
 280
 281         # Attempt to use provided username and password or .netrc data
 282         if downloader_params.get('username', None) is not None:
 283             username = downloader_params['username']
 284             password = downloader_params['password']
 285         elif downloader_params.get('usenetrc', False):
 286             try:
 287                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 288                 if info is not None:
 289                     username = info[0]
 290                     password = info[2]
 291                 else:
 292                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 293             except (IOError, netrc.NetrcParseError) as err:
 294                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 295                 return
 296
 297         # Set language
 298         request = compat_urllib_request.Request(self._LANG_URL)
 299         try:
 300             self.report_lang()
 301             compat_urllib_request.urlopen(request).read()
 302         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 303             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 304             return
 305
 306         # No authentication to be performed
 307         if username is None:
 308             return
 309
 310         # Log in
 311         login_form = {
 312                 'current_form': 'loginForm',
 313                 'next':     '/',
 314                 'action_login': 'Log In',
 315                 'username': username,
 316                 'password': password,
 317                 }
 318         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 319         try:
 320             self.report_login()
 321             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 322             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 323                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 324                 return
 325         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 326             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 327             return
 328
 329         # Confirm age
 330         age_form = {
 331                 'next_url':     '/',
 332                 'action_confirm':   'Confirm',
 333                 }
 334         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 335         try:
 336             self.report_age_confirmation()
 337             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 338         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 339             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 340             return
 341
 342     def _extract_id(self, url):
 343         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 344         if mobj is None:
 345             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 346             return
 347         video_id = mobj.group(2)
 348         return video_id
 349
 350     def _real_extract(self, url):
 351         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 352         mobj = re.search(self._NEXT_URL_RE, url)
 353         if mobj:
 354             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 355         video_id = self._extract_id(url)
 356
 357         # Get video webpage
 358         self.report_video_webpage_download(video_id)
 359         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 360         request = compat_urllib_request.Request(url)
 361         try:
 362             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 363         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 364             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 365             return
 366
 367         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 368
 369         # Attempt to extract SWF player URL
 370         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 371         if mobj is not None:
 372             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 373         else:
 374             player_url = None
 375
 376         # Get video info
 377         self.report_video_info_webpage_download(video_id)
 378         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 379             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 380                     % (video_id, el_type))
 381             request = compat_urllib_request.Request(video_info_url)
 382             try:
 383                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 384                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 385                 video_info = compat_parse_qs(video_info_webpage)
 386                 if 'token' in video_info:
 387                     break
 388             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 389                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 390                 return
 391         if 'token' not in video_info:
 392             if 'reason' in video_info:
 393                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 394             else:
 395                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 396             return
 397
 398         # Check for "rental" videos
 399         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 400             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 401             return
 402
 403         # Start extracting information
 404         self.report_information_extraction(video_id)
 405
 406         # uploader
 407         if 'author' not in video_info:
 408             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 409             return
 410         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 411
 412         # uploader_id
 413         video_uploader_id = None
 414         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
 415         if mobj is not None:
 416             video_uploader_id = mobj.group(1)
 417         else:
 418             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 419
 420         # title
 421         if 'title' not in video_info:
 422             self._downloader.trouble(u'ERROR: unable to extract video title')
 423             return
 424         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 425
 426         # thumbnail image
 427         if 'thumbnail_url' not in video_info:
 428             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 429             video_thumbnail = ''
 430         else:   # don't panic if we can't find it
 431             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 432
 433         # upload date
 434         upload_date = None
 435         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 436         if mobj is not None:
 437             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 438             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 439             for expression in format_expressions:
 440                 try:
 441                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 442                 except:
 443                     pass
 444
 445         # description
 446         video_description = get_element_by_id("eow-description", video_webpage)
 447         if video_description:
 448             video_description = clean_html(video_description)
 449         else:
 450             video_description = ''
 451
 452         # closed captions
 453         video_subtitles = None
 454         if self._downloader.params.get('writesubtitles', False):
 455             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 456             if srt_error:
 457                 self._downloader.trouble(srt_error)
 458
 459         if 'length_seconds' not in video_info:
 460             self._downloader.trouble(u'WARNING: unable to extract video duration')
 461             video_duration = ''
 462         else:
 463             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 464
 465         # token
 466         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 467
 468         # Decide which formats to download
 469         req_format = self._downloader.params.get('format', None)
 470
 471         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 472             self.report_rtmp_download()
 473             video_url_list = [(None, video_info['conn'][0])]
 474         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 475             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 476             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 477             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 478             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 479
 480             format_limit = self._downloader.params.get('format_limit', None)
 481             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 482             if format_limit is not None and format_limit in available_formats:
 483                 format_list = available_formats[available_formats.index(format_limit):]
 484             else:
 485                 format_list = available_formats
 486             existing_formats = [x for x in format_list if x in url_map]
 487             if len(existing_formats) == 0:
 488                 self._downloader.trouble(u'ERROR: no known formats available for video')
 489                 return
 490             if self._downloader.params.get('listformats', None):
 491                 self._print_formats(existing_formats)
 492                 return
 493             if req_format is None or req_format == 'best':
 494                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 495             elif req_format == 'worst':
 496                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 497             elif req_format in ('-1', 'all'):
 498                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 499             else:
 500                 # Specific formats. We pick the first in a slash-delimeted sequence.
 501                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 502                 req_formats = req_format.split('/')
 503                 video_url_list = None
 504                 for rf in req_formats:
 505                     if rf in url_map:
 506                         video_url_list = [(rf, url_map[rf])]
 507                         break
 508                 if video_url_list is None:
 509                     self._downloader.trouble(u'ERROR: requested format not available')
 510                     return
 511         else:
 512             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 513             return
 514
 515         results = []
 516         for format_param, video_real_url in video_url_list:
 517             # Extension
 518             video_extension = self._video_extensions.get(format_param, 'flv')
 519
 520             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 521                                               self._video_dimensions.get(format_param, '???'))
 522
 523             results.append({
 524                 'id':       video_id,
 525                 'url':      video_real_url,
 526                 'uploader': video_uploader,
 527                 'uploader_id': video_uploader_id,
 528                 'upload_date':  upload_date,
 529                 'title':    video_title,
 530                 'ext':      video_extension,
 531                 'format':   video_format,
 532                 'thumbnail':    video_thumbnail,
 533                 'description':  video_description,
 534                 'player_url':   player_url,
 535                 'subtitles':    video_subtitles,
 536                 'duration':     video_duration
 537             })
 538         return results
 539
 540
 541 class MetacafeIE(InfoExtractor):
 542     """Information Extractor for metacafe.com."""
 543
 544     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 545     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 546     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 547     IE_NAME = u'metacafe'
 548
 549     def __init__(self, downloader=None):
 550         InfoExtractor.__init__(self, downloader)
 551
 552     def report_disclaimer(self):
 553         """Report disclaimer retrieval."""
 554         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 555
 556     def report_age_confirmation(self):
 557         """Report attempt to confirm age."""
 558         self._downloader.to_screen(u'[metacafe] Confirming age')
 559
 560     def report_download_webpage(self, video_id):
 561         """Report webpage download."""
 562         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 563
 564     def report_extraction(self, video_id):
 565         """Report information extraction."""
 566         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 567
 568     def _real_initialize(self):
 569         # Retrieve disclaimer
 570         request = compat_urllib_request.Request(self._DISCLAIMER)
 571         try:
 572             self.report_disclaimer()
 573             disclaimer = compat_urllib_request.urlopen(request).read()
 574         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 575             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 576             return
 577
 578         # Confirm age
 579         disclaimer_form = {
 580             'filters': '0',
 581             'submit': "Continue - I'm over 18",
 582             }
 583         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 584         try:
 585             self.report_age_confirmation()
 586             disclaimer = compat_urllib_request.urlopen(request).read()
 587         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 588             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 589             return
 590
 591     def _real_extract(self, url):
 592         # Extract id and simplified title from URL
 593         mobj = re.match(self._VALID_URL, url)
 594         if mobj is None:
 595             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 596             return
 597
 598         video_id = mobj.group(1)
 599
 600         # Check if video comes from YouTube
 601         mobj2 = re.match(r'^yt-(.*)$', video_id)
 602         if mobj2 is not None:
 603             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 604             return
 605
 606         # Retrieve video webpage to extract further information
 607         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 608         try:
 609             self.report_download_webpage(video_id)
 610             webpage = compat_urllib_request.urlopen(request).read()
 611         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 612             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 613             return
 614
 615         # Extract URL, uploader and title from webpage
 616         self.report_extraction(video_id)
 617         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 618         if mobj is not None:
 619             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 620             video_extension = mediaURL[-3:]
 621
 622             # Extract gdaKey if available
 623             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 624             if mobj is None:
 625                 video_url = mediaURL
 626             else:
 627                 gdaKey = mobj.group(1)
 628                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 629         else:
 630             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 631             if mobj is None:
 632                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 633                 return
 634             vardict = compat_parse_qs(mobj.group(1))
 635             if 'mediaData' not in vardict:
 636                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 637                 return
 638             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 639             if mobj is None:
 640                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 641                 return
 642             mediaURL = mobj.group(1).replace('\\/', '/')
 643             video_extension = mediaURL[-3:]
 644             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 645
 646         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 647         if mobj is None:
 648             self._downloader.trouble(u'ERROR: unable to extract title')
 649             return
 650         video_title = mobj.group(1).decode('utf-8')
 651
 652         mobj = re.search(r'submitter=(.*?);', webpage)
 653         if mobj is None:
 654             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 655             return
 656         video_uploader = mobj.group(1)
 657
 658         return [{
 659             'id':       video_id.decode('utf-8'),
 660             'url':      video_url.decode('utf-8'),
 661             'uploader': video_uploader.decode('utf-8'),
 662             'upload_date':  None,
 663             'title':    video_title,
 664             'ext':      video_extension.decode('utf-8'),
 665         }]
 666
 667
 668 class DailymotionIE(InfoExtractor):
 669     """Information Extractor for Dailymotion"""
 670
 671     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 672     IE_NAME = u'dailymotion'
 673
 674     def __init__(self, downloader=None):
 675         InfoExtractor.__init__(self, downloader)
 676
 677     def report_download_webpage(self, video_id):
 678         """Report webpage download."""
 679         self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 680
 681     def report_extraction(self, video_id):
 682         """Report information extraction."""
 683         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 684
 685     def _real_extract(self, url):
 686         # Extract id and simplified title from URL
 687         mobj = re.match(self._VALID_URL, url)
 688         if mobj is None:
 689             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 690             return
 691
 692         video_id = mobj.group(1).split('_')[0].split('?')[0]
 693
 694         video_extension = 'mp4'
 695
 696         # Retrieve video webpage to extract further information
 697         request = compat_urllib_request.Request(url)
 698         request.add_header('Cookie', 'family_filter=off')
 699         try:
 700             self.report_download_webpage(video_id)
 701             webpage_bytes = compat_urllib_request.urlopen(request).read()
 702             webpage = webpage_bytes.decode('utf-8')
 703         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 704             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 705             return
 706
 707         # Extract URL, uploader and title from webpage
 708         self.report_extraction(video_id)
 709         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 710         if mobj is None:
 711             self._downloader.trouble(u'ERROR: unable to extract media URL')
 712             return
 713         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 714
 715         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 716             if key in flashvars:
 717                 max_quality = key
 718                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 719                 break
 720         else:
 721             self._downloader.trouble(u'ERROR: unable to extract video URL')
 722             return
 723
 724         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 725         if mobj is None:
 726             self._downloader.trouble(u'ERROR: unable to extract video URL')
 727             return
 728
 729         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 730
 731         # TODO: support choosing qualities
 732
 733         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 734         if mobj is None:
 735             self._downloader.trouble(u'ERROR: unable to extract title')
 736             return
 737         video_title = unescapeHTML(mobj.group('title'))
 738
 739         video_uploader = None
 740         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 741         if mobj is None:
 742             # lookin for official user
 743             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 744             if mobj_official is None:
 745                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 746             else:
 747                 video_uploader = mobj_official.group(1)
 748         else:
 749             video_uploader = mobj.group(1)
 750
 751         video_upload_date = None
 752         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 753         if mobj is not None:
 754             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 755
 756         return [{
 757             'id':       video_id,
 758             'url':      video_url,
 759             'uploader': video_uploader,
 760             'upload_date':  video_upload_date,
 761             'title':    video_title,
 762             'ext':      video_extension,
 763         }]
 764
 765
 766 class PhotobucketIE(InfoExtractor):
 767     """Information extractor for photobucket.com."""
 768
 769     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 770     IE_NAME = u'photobucket'
 771
 772     def __init__(self, downloader=None):
 773         InfoExtractor.__init__(self, downloader)
 774
 775     def report_download_webpage(self, video_id):
 776         """Report webpage download."""
 777         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 778
 779     def report_extraction(self, video_id):
 780         """Report information extraction."""
 781         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 782
 783     def _real_extract(self, url):
 784         # Extract id from URL
 785         mobj = re.match(self._VALID_URL, url)
 786         if mobj is None:
 787             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 788             return
 789
 790         video_id = mobj.group(1)
 791
 792         video_extension = 'flv'
 793
 794         # Retrieve video webpage to extract further information
 795         request = compat_urllib_request.Request(url)
 796         try:
 797             self.report_download_webpage(video_id)
 798             webpage = compat_urllib_request.urlopen(request).read()
 799         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 800             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 801             return
 802
 803         # Extract URL, uploader, and title from webpage
 804         self.report_extraction(video_id)
 805         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 806         if mobj is None:
 807             self._downloader.trouble(u'ERROR: unable to extract media URL')
 808             return
 809         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 810
 811         video_url = mediaURL
 812
 813         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 814         if mobj is None:
 815             self._downloader.trouble(u'ERROR: unable to extract title')
 816             return
 817         video_title = mobj.group(1).decode('utf-8')
 818
 819         video_uploader = mobj.group(2).decode('utf-8')
 820
 821         return [{
 822             'id':       video_id.decode('utf-8'),
 823             'url':      video_url.decode('utf-8'),
 824             'uploader': video_uploader,
 825             'upload_date':  None,
 826             'title':    video_title,
 827             'ext':      video_extension.decode('utf-8'),
 828         }]
 829
 830
 831 class YahooIE(InfoExtractor):
 832     """Information extractor for video.yahoo.com."""
 833
 834     _WORKING = False
 835     # _VALID_URL matches all Yahoo! Video URLs
 836     # _VPAGE_URL matches only the extractable '/watch/' URLs
 837     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 838     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 839     IE_NAME = u'video.yahoo'
 840
 841     def __init__(self, downloader=None):
 842         InfoExtractor.__init__(self, downloader)
 843
 844     def report_download_webpage(self, video_id):
 845         """Report webpage download."""
 846         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 847
 848     def report_extraction(self, video_id):
 849         """Report information extraction."""
 850         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 851
 852     def _real_extract(self, url, new_video=True):
 853         # Extract ID from URL
 854         mobj = re.match(self._VALID_URL, url)
 855         if mobj is None:
 856             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 857             return
 858
 859         video_id = mobj.group(2)
 860         video_extension = 'flv'
 861
 862         # Rewrite valid but non-extractable URLs as
 863         # extractable English language /watch/ URLs
 864         if re.match(self._VPAGE_URL, url) is None:
 865             request = compat_urllib_request.Request(url)
 866             try:
 867                 webpage = compat_urllib_request.urlopen(request).read()
 868             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 869                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 870                 return
 871
 872             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 873             if mobj is None:
 874                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 875                 return
 876             yahoo_id = mobj.group(1)
 877
 878             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 879             if mobj is None:
 880                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 881                 return
 882             yahoo_vid = mobj.group(1)
 883
 884             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 885             return self._real_extract(url, new_video=False)
 886
 887         # Retrieve video webpage to extract further information
 888         request = compat_urllib_request.Request(url)
 889         try:
 890             self.report_download_webpage(video_id)
 891             webpage = compat_urllib_request.urlopen(request).read()
 892         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 893             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 894             return
 895
 896         # Extract uploader and title from webpage
 897         self.report_extraction(video_id)
 898         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 899         if mobj is None:
 900             self._downloader.trouble(u'ERROR: unable to extract video title')
 901             return
 902         video_title = mobj.group(1).decode('utf-8')
 903
 904         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 905         if mobj is None:
 906             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 907             return
 908         video_uploader = mobj.group(1).decode('utf-8')
 909
 910         # Extract video thumbnail
 911         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 912         if mobj is None:
 913             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 914             return
 915         video_thumbnail = mobj.group(1).decode('utf-8')
 916
 917         # Extract video description
 918         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 919         if mobj is None:
 920             self._downloader.trouble(u'ERROR: unable to extract video description')
 921             return
 922         video_description = mobj.group(1).decode('utf-8')
 923         if not video_description:
 924             video_description = 'No description available.'
 925
 926         # Extract video height and width
 927         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 928         if mobj is None:
 929             self._downloader.trouble(u'ERROR: unable to extract video height')
 930             return
 931         yv_video_height = mobj.group(1)
 932
 933         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 934         if mobj is None:
 935             self._downloader.trouble(u'ERROR: unable to extract video width')
 936             return
 937         yv_video_width = mobj.group(1)
 938
 939         # Retrieve video playlist to extract media URL
 940         # I'm not completely sure what all these options are, but we
 941         # seem to need most of them, otherwise the server sends a 401.
 942         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 943         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 944         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 945                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 946                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 947         try:
 948             self.report_download_webpage(video_id)
 949             webpage = compat_urllib_request.urlopen(request).read()
 950         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 951             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 952             return
 953
 954         # Extract media URL from playlist XML
 955         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 956         if mobj is None:
 957             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 958             return
 959         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 960         video_url = unescapeHTML(video_url)
 961
 962         return [{
 963             'id':       video_id.decode('utf-8'),
 964             'url':      video_url,
 965             'uploader': video_uploader,
 966             'upload_date':  None,
 967             'title':    video_title,
 968             'ext':      video_extension.decode('utf-8'),
 969             'thumbnail':    video_thumbnail.decode('utf-8'),
 970             'description':  video_description,
 971         }]
 972
 973
 974 class VimeoIE(InfoExtractor):
 975     """Information extractor for vimeo.com."""
 976
 977     # _VALID_URL matches Vimeo URLs
 978     _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
 979     IE_NAME = u'vimeo'
 980
 981     def __init__(self, downloader=None):
 982         InfoExtractor.__init__(self, downloader)
 983
 984     def report_download_webpage(self, video_id):
 985         """Report webpage download."""
 986         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 987
 988     def report_extraction(self, video_id):
 989         """Report information extraction."""
 990         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 991
 992     def _real_extract(self, url, new_video=True):
 993         # Extract ID from URL
 994         mobj = re.match(self._VALID_URL, url)
 995         if mobj is None:
 996             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 997             return
 998
 999         video_id = mobj.group(1)
1000
1001         # Retrieve video webpage to extract further information
1002         request = compat_urllib_request.Request(url, None, std_headers)
1003         try:
1004             self.report_download_webpage(video_id)
1005             webpage_bytes = compat_urllib_request.urlopen(request).read()
1006             webpage = webpage_bytes.decode('utf-8')
1007         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1008             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1009             return
1010
1011         # Now we begin extracting as much information as we can from what we
1012         # retrieved. First we extract the information common to all extractors,
1013         # and latter we extract those that are Vimeo specific.
1014         self.report_extraction(video_id)
1015
1016         # Extract the config JSON
1017         try:
1018             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1019             config = json.loads(config)
1020         except:
1021             self._downloader.trouble(u'ERROR: unable to extract info section')
1022             return
1023
1024         # Extract title
1025         video_title = config["video"]["title"]
1026
1027         # Extract uploader and uploader_id
1028         video_uploader = config["video"]["owner"]["name"]
1029         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1030
1031         # Extract video thumbnail
1032         video_thumbnail = config["video"]["thumbnail"]
1033
1034         # Extract video description
1035         video_description = get_element_by_attribute("itemprop", "description", webpage)
1036         if video_description: video_description = clean_html(video_description)
1037         else: video_description = ''
1038
1039         # Extract upload date
1040         video_upload_date = None
1041         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1042         if mobj is not None:
1043             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1044
1045         # Vimeo specific: extract request signature and timestamp
1046         sig = config['request']['signature']
1047         timestamp = config['request']['timestamp']
1048
1049         # Vimeo specific: extract video codec and quality information
1050         # First consider quality, then codecs, then take everything
1051         # TODO bind to format param
1052         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053         files = { 'hd': [], 'sd': [], 'other': []}
1054         for codec_name, codec_extension in codecs:
1055             if codec_name in config["video"]["files"]:
1056                 if 'hd' in config["video"]["files"][codec_name]:
1057                     files['hd'].append((codec_name, codec_extension, 'hd'))
1058                 elif 'sd' in config["video"]["files"][codec_name]:
1059                     files['sd'].append((codec_name, codec_extension, 'sd'))
1060                 else:
1061                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1062
1063         for quality in ('hd', 'sd', 'other'):
1064             if len(files[quality]) > 0:
1065                 video_quality = files[quality][0][2]
1066                 video_codec = files[quality][0][0]
1067                 video_extension = files[quality][0][1]
1068                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1069                 break
1070         else:
1071             self._downloader.trouble(u'ERROR: no known codec found')
1072             return
1073
1074         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1075                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1076
1077         return [{
1078             'id':       video_id,
1079             'url':      video_url,
1080             'uploader': video_uploader,
1081             'uploader_id': video_uploader_id,
1082             'upload_date':  video_upload_date,
1083             'title':    video_title,
1084             'ext':      video_extension,
1085             'thumbnail':    video_thumbnail,
1086             'description':  video_description,
1087         }]
1088
1089
1090 class ArteTvIE(InfoExtractor):
1091     """arte.tv information extractor."""
1092
1093     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1094     _LIVE_URL = r'index-[0-9]+\.html$'
1095
1096     IE_NAME = u'arte.tv'
1097
1098     def __init__(self, downloader=None):
1099         InfoExtractor.__init__(self, downloader)
1100
1101     def report_download_webpage(self, video_id):
1102         """Report webpage download."""
1103         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1104
1105     def report_extraction(self, video_id):
1106         """Report information extraction."""
1107         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1108
1109     def fetch_webpage(self, url):
1110         request = compat_urllib_request.Request(url)
1111         try:
1112             self.report_download_webpage(url)
1113             webpage = compat_urllib_request.urlopen(request).read()
1114         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1115             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1116             return
1117         except ValueError as err:
1118             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1119             return
1120         return webpage
1121
1122     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1123         page = self.fetch_webpage(url)
1124         mobj = re.search(regex, page, regexFlags)
1125         info = {}
1126
1127         if mobj is None:
1128             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1129             return
1130
1131         for (i, key, err) in matchTuples:
1132             if mobj.group(i) is None:
1133                 self._downloader.trouble(err)
1134                 return
1135             else:
1136                 info[key] = mobj.group(i)
1137
1138         return info
1139
1140     def extractLiveStream(self, url):
1141         video_lang = url.split('/')[-4]
1142         info = self.grep_webpage(
1143             url,
1144             r'src="(.*?/videothek_js.*?\.js)',
1145             0,
1146             [
1147                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1148             ]
1149         )
1150         http_host = url.split('/')[2]
1151         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1152         info = self.grep_webpage(
1153             next_url,
1154             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1155                 '(http://.*?\.swf).*?' +
1156                 '(rtmp://.*?)\'',
1157             re.DOTALL,
1158             [
1159                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1160                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1161                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1162             ]
1163         )
1164         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1165
1166     def extractPlus7Stream(self, url):
1167         video_lang = url.split('/')[-3]
1168         info = self.grep_webpage(
1169             url,
1170             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1171             0,
1172             [
1173                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1174             ]
1175         )
1176         next_url = compat_urllib_parse.unquote(info.get('url'))
1177         info = self.grep_webpage(
1178             next_url,
1179             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1180             0,
1181             [
1182                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1183             ]
1184         )
1185         next_url = compat_urllib_parse.unquote(info.get('url'))
1186
1187         info = self.grep_webpage(
1188             next_url,
1189             r'<video id="(.*?)".*?>.*?' +
1190                 '<name>(.*?)</name>.*?' +
1191                 '<dateVideo>(.*?)</dateVideo>.*?' +
1192                 '<url quality="hd">(.*?)</url>',
1193             re.DOTALL,
1194             [
1195                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1196                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1197                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1198                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1199             ]
1200         )
1201
1202         return {
1203             'id':           info.get('id'),
1204             'url':          compat_urllib_parse.unquote(info.get('url')),
1205             'uploader':     u'arte.tv',
1206             'upload_date':  info.get('date'),
1207             'title':        info.get('title').decode('utf-8'),
1208             'ext':          u'mp4',
1209             'format':       u'NA',
1210             'player_url':   None,
1211         }
1212
1213     def _real_extract(self, url):
1214         video_id = url.split('/')[-1]
1215         self.report_extraction(video_id)
1216
1217         if re.search(self._LIVE_URL, video_id) is not None:
1218             self.extractLiveStream(url)
1219             return
1220         else:
1221             info = self.extractPlus7Stream(url)
1222
1223         return [info]
1224
1225
1226 class GenericIE(InfoExtractor):
1227     """Generic last-resort information extractor."""
1228
1229     _VALID_URL = r'.*'
1230     IE_NAME = u'generic'
1231
1232     def __init__(self, downloader=None):
1233         InfoExtractor.__init__(self, downloader)
1234
1235     def report_download_webpage(self, video_id):
1236         """Report webpage download."""
1237         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1238         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1239
1240     def report_extraction(self, video_id):
1241         """Report information extraction."""
1242         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1243
1244     def report_following_redirect(self, new_url):
1245         """Report information extraction."""
1246         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1247
1248     def _test_redirect(self, url):
1249         """Check if it is a redirect, like url shorteners, in case restart chain."""
1250         class HeadRequest(compat_urllib_request.Request):
1251             def get_method(self):
1252                 return "HEAD"
1253
1254         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1255             """
1256             Subclass the HTTPRedirectHandler to make it use our
1257             HeadRequest also on the redirected URL
1258             """
1259             def redirect_request(self, req, fp, code, msg, headers, newurl):
1260                 if code in (301, 302, 303, 307):
1261                     newurl = newurl.replace(' ', '%20')
1262                     newheaders = dict((k,v) for k,v in req.headers.items()
1263                                       if k.lower() not in ("content-length", "content-type"))
1264                     return HeadRequest(newurl,
1265                                        headers=newheaders,
1266                                        origin_req_host=req.get_origin_req_host(),
1267                                        unverifiable=True)
1268                 else:
1269                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1270
1271         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1272             """
1273             Fallback to GET if HEAD is not allowed (405 HTTP error)
1274             """
1275             def http_error_405(self, req, fp, code, msg, headers):
1276                 fp.read()
1277                 fp.close()
1278
1279                 newheaders = dict((k,v) for k,v in req.headers.items()
1280                                   if k.lower() not in ("content-length", "content-type"))
1281                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1282                                                  headers=newheaders,
1283                                                  origin_req_host=req.get_origin_req_host(),
1284                                                  unverifiable=True))
1285
1286         # Build our opener
1287         opener = compat_urllib_request.OpenerDirector()
1288         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1289                         HTTPMethodFallback, HEADRedirectHandler,
1290                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1291             opener.add_handler(handler())
1292
1293         response = opener.open(HeadRequest(url))
1294         new_url = response.geturl()
1295
1296         if url == new_url:
1297             return False
1298
1299         self.report_following_redirect(new_url)
1300         self._downloader.download([new_url])
1301         return True
1302
1303     def _real_extract(self, url):
1304         if self._test_redirect(url): return
1305
1306         video_id = url.split('/')[-1]
1307         request = compat_urllib_request.Request(url)
1308         try:
1309             self.report_download_webpage(video_id)
1310             webpage = compat_urllib_request.urlopen(request).read()
1311         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1312             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1313             return
1314         except ValueError as err:
1315             # since this is the last-resort InfoExtractor, if
1316             # this error is thrown, it'll be thrown here
1317             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1318             return
1319
1320         self.report_extraction(video_id)
1321         # Start with something easy: JW Player in SWFObject
1322         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1323         if mobj is None:
1324             # Broaden the search a little bit
1325             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1326         if mobj is None:
1327             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1328             return
1329
1330         # It's possible that one of the regexes
1331         # matched, but returned an empty group:
1332         if mobj.group(1) is None:
1333             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1334             return
1335
1336         video_url = compat_urllib_parse.unquote(mobj.group(1))
1337         video_id = os.path.basename(video_url)
1338
1339         # here's a fun little line of code for you:
1340         video_extension = os.path.splitext(video_id)[1][1:]
1341         video_id = os.path.splitext(video_id)[0]
1342
1343         # it's tempting to parse this further, but you would
1344         # have to take into account all the variations like
1345         #   Video Title - Site Name
1346         #   Site Name | Video Title
1347         #   Video Title - Tagline | Site Name
1348         # and so on and so forth; it's just not practical
1349         mobj = re.search(r'<title>(.*)</title>', webpage)
1350         if mobj is None:
1351             self._downloader.trouble(u'ERROR: unable to extract title')
1352             return
1353         video_title = mobj.group(1)
1354
1355         # video uploader is domain name
1356         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1357         if mobj is None:
1358             self._downloader.trouble(u'ERROR: unable to extract title')
1359             return
1360         video_uploader = mobj.group(1)
1361
1362         return [{
1363             'id':       video_id,
1364             'url':      video_url,
1365             'uploader': video_uploader,
1366             'upload_date':  None,
1367             'title':    video_title,
1368             'ext':      video_extension,
1369         }]
1370
1371
1372 class YoutubeSearchIE(InfoExtractor):
1373     """Information Extractor for YouTube search queries."""
1374     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1375     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1376     _max_youtube_results = 1000
1377     IE_NAME = u'youtube:search'
1378
1379     def __init__(self, downloader=None):
1380         InfoExtractor.__init__(self, downloader)
1381
1382     def report_download_page(self, query, pagenum):
1383         """Report attempt to download search page with given number."""
1384         query = query.decode(preferredencoding())
1385         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1386
1387     def _real_extract(self, query):
1388         mobj = re.match(self._VALID_URL, query)
1389         if mobj is None:
1390             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1391             return
1392
1393         prefix, query = query.split(':')
1394         prefix = prefix[8:]
1395         query = query.encode('utf-8')
1396         if prefix == '':
1397             self._download_n_results(query, 1)
1398             return
1399         elif prefix == 'all':
1400             self._download_n_results(query, self._max_youtube_results)
1401             return
1402         else:
1403             try:
1404                 n = int(prefix)
1405                 if n <= 0:
1406                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1407                     return
1408                 elif n > self._max_youtube_results:
1409                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1410                     n = self._max_youtube_results
1411                 self._download_n_results(query, n)
1412                 return
1413             except ValueError: # parsing prefix as integer fails
1414                 self._download_n_results(query, 1)
1415                 return
1416
1417     def _download_n_results(self, query, n):
1418         """Downloads a specified number of results for a query"""
1419
1420         video_ids = []
1421         pagenum = 0
1422         limit = n
1423
1424         while (50 * pagenum) < limit:
1425             self.report_download_page(query, pagenum+1)
1426             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1427             request = compat_urllib_request.Request(result_url)
1428             try:
1429                 data = compat_urllib_request.urlopen(request).read()
1430             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1431                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1432                 return
1433             api_response = json.loads(data)['data']
1434
1435             new_ids = list(video['id'] for video in api_response['items'])
1436             video_ids += new_ids
1437
1438             limit = min(n, api_response['totalItems'])
1439             pagenum += 1
1440
1441         if len(video_ids) > n:
1442             video_ids = video_ids[:n]
1443         for id in video_ids:
1444             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1445         return
1446
1447
1448 class GoogleSearchIE(InfoExtractor):
1449     """Information Extractor for Google Video search queries."""
1450     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1451     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1452     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1453     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1454     _max_google_results = 1000
1455     IE_NAME = u'video.google:search'
1456
1457     def __init__(self, downloader=None):
1458         InfoExtractor.__init__(self, downloader)
1459
1460     def report_download_page(self, query, pagenum):
1461         """Report attempt to download playlist page with given number."""
1462         query = query.decode(preferredencoding())
1463         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1464
1465     def _real_extract(self, query):
1466         mobj = re.match(self._VALID_URL, query)
1467         if mobj is None:
1468             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1469             return
1470
1471         prefix, query = query.split(':')
1472         prefix = prefix[8:]
1473         query = query.encode('utf-8')
1474         if prefix == '':
1475             self._download_n_results(query, 1)
1476             return
1477         elif prefix == 'all':
1478             self._download_n_results(query, self._max_google_results)
1479             return
1480         else:
1481             try:
1482                 n = int(prefix)
1483                 if n <= 0:
1484                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1485                     return
1486                 elif n > self._max_google_results:
1487                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1488                     n = self._max_google_results
1489                 self._download_n_results(query, n)
1490                 return
1491             except ValueError: # parsing prefix as integer fails
1492                 self._download_n_results(query, 1)
1493                 return
1494
1495     def _download_n_results(self, query, n):
1496         """Downloads a specified number of results for a query"""
1497
1498         video_ids = []
1499         pagenum = 0
1500
1501         while True:
1502             self.report_download_page(query, pagenum)
1503             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1504             request = compat_urllib_request.Request(result_url)
1505             try:
1506                 page = compat_urllib_request.urlopen(request).read()
1507             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1508                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1509                 return
1510
1511             # Extract video identifiers
1512             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1513                 video_id = mobj.group(1)
1514                 if video_id not in video_ids:
1515                     video_ids.append(video_id)
1516                     if len(video_ids) == n:
1517                         # Specified n videos reached
1518                         for id in video_ids:
1519                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1520                         return
1521
1522             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1523                 for id in video_ids:
1524                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1525                 return
1526
1527             pagenum = pagenum + 1
1528
1529
1530 class YahooSearchIE(InfoExtractor):
1531     """Information Extractor for Yahoo! Video search queries."""
1532
1533     _WORKING = False
1534     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1535     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1536     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1537     _MORE_PAGES_INDICATOR = r'\s*Next'
1538     _max_yahoo_results = 1000
1539     IE_NAME = u'video.yahoo:search'
1540
1541     def __init__(self, downloader=None):
1542         InfoExtractor.__init__(self, downloader)
1543
1544     def report_download_page(self, query, pagenum):
1545         """Report attempt to download playlist page with given number."""
1546         query = query.decode(preferredencoding())
1547         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1548
1549     def _real_extract(self, query):
1550         mobj = re.match(self._VALID_URL, query)
1551         if mobj is None:
1552             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1553             return
1554
1555         prefix, query = query.split(':')
1556         prefix = prefix[8:]
1557         query = query.encode('utf-8')
1558         if prefix == '':
1559             self._download_n_results(query, 1)
1560             return
1561         elif prefix == 'all':
1562             self._download_n_results(query, self._max_yahoo_results)
1563             return
1564         else:
1565             try:
1566                 n = int(prefix)
1567                 if n <= 0:
1568                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1569                     return
1570                 elif n > self._max_yahoo_results:
1571                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1572                     n = self._max_yahoo_results
1573                 self._download_n_results(query, n)
1574                 return
1575             except ValueError: # parsing prefix as integer fails
1576                 self._download_n_results(query, 1)
1577                 return
1578
1579     def _download_n_results(self, query, n):
1580         """Downloads a specified number of results for a query"""
1581
1582         video_ids = []
1583         already_seen = set()
1584         pagenum = 1
1585
1586         while True:
1587             self.report_download_page(query, pagenum)
1588             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1589             request = compat_urllib_request.Request(result_url)
1590             try:
1591                 page = compat_urllib_request.urlopen(request).read()
1592             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1593                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1594                 return
1595
1596             # Extract video identifiers
1597             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1598                 video_id = mobj.group(1)
1599                 if video_id not in already_seen:
1600                     video_ids.append(video_id)
1601                     already_seen.add(video_id)
1602                     if len(video_ids) == n:
1603                         # Specified n videos reached
1604                         for id in video_ids:
1605                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1606                         return
1607
1608             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1609                 for id in video_ids:
1610                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1611                 return
1612
1613             pagenum = pagenum + 1
1614
1615
1616 class YoutubePlaylistIE(InfoExtractor):
1617     """Information Extractor for YouTube playlists."""
1618
1619     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1620     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1621     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1622     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1623     IE_NAME = u'youtube:playlist'
1624
1625     def __init__(self, downloader=None):
1626         InfoExtractor.__init__(self, downloader)
1627
1628     def report_download_page(self, playlist_id, pagenum):
1629         """Report attempt to download playlist page with given number."""
1630         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1631
1632     def _real_extract(self, url):
1633         # Extract playlist id
1634         mobj = re.match(self._VALID_URL, url)
1635         if mobj is None:
1636             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1637             return
1638
1639         # Single video case
1640         if mobj.group(3) is not None:
1641             self._downloader.download([mobj.group(3)])
1642             return
1643
1644         # Download playlist pages
1645         # prefix is 'p' as default for playlists but there are other types that need extra care
1646         playlist_prefix = mobj.group(1)
1647         if playlist_prefix == 'a':
1648             playlist_access = 'artist'
1649         else:
1650             playlist_prefix = 'p'
1651             playlist_access = 'view_play_list'
1652         playlist_id = mobj.group(2)
1653         video_ids = []
1654         pagenum = 1
1655
1656         while True:
1657             self.report_download_page(playlist_id, pagenum)
1658             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1659             request = compat_urllib_request.Request(url)
1660             try:
1661                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1662             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1663                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1664                 return
1665
1666             # Extract video identifiers
1667             ids_in_page = []
1668             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1669                 if mobj.group(1) not in ids_in_page:
1670                     ids_in_page.append(mobj.group(1))
1671             video_ids.extend(ids_in_page)
1672
1673             if self._MORE_PAGES_INDICATOR not in page:
1674                 break
1675             pagenum = pagenum + 1
1676
1677         total = len(video_ids)
1678
1679         playliststart = self._downloader.params.get('playliststart', 1) - 1
1680         playlistend = self._downloader.params.get('playlistend', -1)
1681         if playlistend == -1:
1682             video_ids = video_ids[playliststart:]
1683         else:
1684             video_ids = video_ids[playliststart:playlistend]
1685
1686         if len(video_ids) == total:
1687             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1688         else:
1689             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1690
1691         for id in video_ids:
1692             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1693         return
1694
1695
1696 class YoutubeChannelIE(InfoExtractor):
1697     """Information Extractor for YouTube channels."""
1698
1699     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1700     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1701     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1702     IE_NAME = u'youtube:channel'
1703
1704     def report_download_page(self, channel_id, pagenum):
1705         """Report attempt to download channel page with given number."""
1706         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1707
1708     def _real_extract(self, url):
1709         # Extract channel id
1710         mobj = re.match(self._VALID_URL, url)
1711         if mobj is None:
1712             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1713             return
1714
1715         # Download channel pages
1716         channel_id = mobj.group(1)
1717         video_ids = []
1718         pagenum = 1
1719
1720         while True:
1721             self.report_download_page(channel_id, pagenum)
1722             url = self._TEMPLATE_URL % (channel_id, pagenum)
1723             request = compat_urllib_request.Request(url)
1724             try:
1725                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1726             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1727                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1728                 return
1729
1730             # Extract video identifiers
1731             ids_in_page = []
1732             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1733                 if mobj.group(1) not in ids_in_page:
1734                     ids_in_page.append(mobj.group(1))
1735             video_ids.extend(ids_in_page)
1736
1737             if self._MORE_PAGES_INDICATOR not in page:
1738                 break
1739             pagenum = pagenum + 1
1740
1741         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1742
1743         for id in video_ids:
1744             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1745         return
1746
1747
1748 class YoutubeUserIE(InfoExtractor):
1749     """Information Extractor for YouTube users."""
1750
1751     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1752     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1753     _GDATA_PAGE_SIZE = 50
1754     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1755     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1756     IE_NAME = u'youtube:user'
1757
1758     def __init__(self, downloader=None):
1759         InfoExtractor.__init__(self, downloader)
1760
1761     def report_download_page(self, username, start_index):
1762         """Report attempt to download user page."""
1763         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1764                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1765
1766     def _real_extract(self, url):
1767         # Extract username
1768         mobj = re.match(self._VALID_URL, url)
1769         if mobj is None:
1770             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1771             return
1772
1773         username = mobj.group(1)
1774
1775         # Download video ids using YouTube Data API. Result size per
1776         # query is limited (currently to 50 videos) so we need to query
1777         # page by page until there are no video ids - it means we got
1778         # all of them.
1779
1780         video_ids = []
1781         pagenum = 0
1782
1783         while True:
1784             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1785             self.report_download_page(username, start_index)
1786
1787             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1788
1789             try:
1790                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1791             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1792                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1793                 return
1794
1795             # Extract video identifiers
1796             ids_in_page = []
1797
1798             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1799                 if mobj.group(1) not in ids_in_page:
1800                     ids_in_page.append(mobj.group(1))
1801
1802             video_ids.extend(ids_in_page)
1803
1804             # A little optimization - if current page is not
1805             # "full", ie. does not contain PAGE_SIZE video ids then
1806             # we can assume that this page is the last one - there
1807             # are no more ids on further pages - no need to query
1808             # again.
1809
1810             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1811                 break
1812
1813             pagenum += 1
1814
1815         all_ids_count = len(video_ids)
1816         playliststart = self._downloader.params.get('playliststart', 1) - 1
1817         playlistend = self._downloader.params.get('playlistend', -1)
1818
1819         if playlistend == -1:
1820             video_ids = video_ids[playliststart:]
1821         else:
1822             video_ids = video_ids[playliststart:playlistend]
1823
1824         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1825                 (username, all_ids_count, len(video_ids)))
1826
1827         for video_id in video_ids:
1828             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1829
1830
1831 class BlipTVUserIE(InfoExtractor):
1832     """Information Extractor for blip.tv users."""
1833
1834     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1835     _PAGE_SIZE = 12
1836     IE_NAME = u'blip.tv:user'
1837
1838     def __init__(self, downloader=None):
1839         InfoExtractor.__init__(self, downloader)
1840
1841     def report_download_page(self, username, pagenum):
1842         """Report attempt to download user page."""
1843         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1844                 (self.IE_NAME, username, pagenum))
1845
1846     def _real_extract(self, url):
1847         # Extract username
1848         mobj = re.match(self._VALID_URL, url)
1849         if mobj is None:
1850             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1851             return
1852
1853         username = mobj.group(1)
1854
1855         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1856
1857         request = compat_urllib_request.Request(url)
1858
1859         try:
1860             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1861             mobj = re.search(r'data-users-id="([^"]+)"', page)
1862             page_base = page_base % mobj.group(1)
1863         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1864             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1865             return
1866
1867
1868         # Download video ids using BlipTV Ajax calls. Result size per
1869         # query is limited (currently to 12 videos) so we need to query
1870         # page by page until there are no video ids - it means we got
1871         # all of them.
1872
1873         video_ids = []
1874         pagenum = 1
1875
1876         while True:
1877             self.report_download_page(username, pagenum)
1878
1879             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1880
1881             try:
1882                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1883             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1884                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1885                 return
1886
1887             # Extract video identifiers
1888             ids_in_page = []
1889
1890             for mobj in re.finditer(r'href="/([^"]+)"', page):
1891                 if mobj.group(1) not in ids_in_page:
1892                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1893
1894             video_ids.extend(ids_in_page)
1895
1896             # A little optimization - if current page is not
1897             # "full", ie. does not contain PAGE_SIZE video ids then
1898             # we can assume that this page is the last one - there
1899             # are no more ids on further pages - no need to query
1900             # again.
1901
1902             if len(ids_in_page) < self._PAGE_SIZE:
1903                 break
1904
1905             pagenum += 1
1906
1907         all_ids_count = len(video_ids)
1908         playliststart = self._downloader.params.get('playliststart', 1) - 1
1909         playlistend = self._downloader.params.get('playlistend', -1)
1910
1911         if playlistend == -1:
1912             video_ids = video_ids[playliststart:]
1913         else:
1914             video_ids = video_ids[playliststart:playlistend]
1915
1916         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1917                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1918
1919         for video_id in video_ids:
1920             self._downloader.download([u'http://blip.tv/'+video_id])
1921
1922
1923 class DepositFilesIE(InfoExtractor):
1924     """Information extractor for depositfiles.com"""
1925
1926     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1927
1928     def report_download_webpage(self, file_id):
1929         """Report webpage download."""
1930         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1931
1932     def report_extraction(self, file_id):
1933         """Report information extraction."""
1934         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1935
1936     def _real_extract(self, url):
1937         file_id = url.split('/')[-1]
1938         # Rebuild url in english locale
1939         url = 'http://depositfiles.com/en/files/' + file_id
1940
1941         # Retrieve file webpage with 'Free download' button pressed
1942         free_download_indication = { 'gateway_result' : '1' }
1943         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1944         try:
1945             self.report_download_webpage(file_id)
1946             webpage = compat_urllib_request.urlopen(request).read()
1947         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1948             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1949             return
1950
1951         # Search for the real file URL
1952         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1953         if (mobj is None) or (mobj.group(1) is None):
1954             # Try to figure out reason of the error.
1955             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1956             if (mobj is not None) and (mobj.group(1) is not None):
1957                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1958                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1959             else:
1960                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1961             return
1962
1963         file_url = mobj.group(1)
1964         file_extension = os.path.splitext(file_url)[1][1:]
1965
1966         # Search for file title
1967         mobj = re.search(r'<b title="(.*?)">', webpage)
1968         if mobj is None:
1969             self._downloader.trouble(u'ERROR: unable to extract title')
1970             return
1971         file_title = mobj.group(1).decode('utf-8')
1972
1973         return [{
1974             'id':       file_id.decode('utf-8'),
1975             'url':      file_url.decode('utf-8'),
1976             'uploader': None,
1977             'upload_date':  None,
1978             'title':    file_title,
1979             'ext':      file_extension.decode('utf-8'),
1980         }]
1981
1982
1983 class FacebookIE(InfoExtractor):
1984     """Information Extractor for Facebook"""
1985
1986     _WORKING = False
1987     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1988     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1989     _NETRC_MACHINE = 'facebook'
1990     _available_formats = ['video', 'highqual', 'lowqual']
1991     _video_extensions = {
1992         'video': 'mp4',
1993         'highqual': 'mp4',
1994         'lowqual': 'mp4',
1995     }
1996     IE_NAME = u'facebook'
1997
1998     def __init__(self, downloader=None):
1999         InfoExtractor.__init__(self, downloader)
2000
2001     def _reporter(self, message):
2002         """Add header and report message."""
2003         self._downloader.to_screen(u'[facebook] %s' % message)
2004
2005     def report_login(self):
2006         """Report attempt to log in."""
2007         self._reporter(u'Logging in')
2008
2009     def report_video_webpage_download(self, video_id):
2010         """Report attempt to download video webpage."""
2011         self._reporter(u'%s: Downloading video webpage' % video_id)
2012
2013     def report_information_extraction(self, video_id):
2014         """Report attempt to extract video information."""
2015         self._reporter(u'%s: Extracting video information' % video_id)
2016
2017     def _parse_page(self, video_webpage):
2018         """Extract video information from page"""
2019         # General data
2020         data = {'title': r'\("video_title", "(.*?)"\)',
2021             'description': r'<div class="datawrap">(.*?)</div>',
2022             'owner': r'\("video_owner_name", "(.*?)"\)',
2023             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2024             }
2025         video_info = {}
2026         for piece in data.keys():
2027             mobj = re.search(data[piece], video_webpage)
2028             if mobj is not None:
2029                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2030
2031         # Video urls
2032         video_urls = {}
2033         for fmt in self._available_formats:
2034             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2035             if mobj is not None:
2036                 # URL is in a Javascript segment inside an escaped Unicode format within
2037                 # the generally utf-8 page
2038                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2039         video_info['video_urls'] = video_urls
2040
2041         return video_info
2042
2043     def _real_initialize(self):
2044         if self._downloader is None:
2045             return
2046
2047         useremail = None
2048         password = None
2049         downloader_params = self._downloader.params
2050
2051         # Attempt to use provided username and password or .netrc data
2052         if downloader_params.get('username', None) is not None:
2053             useremail = downloader_params['username']
2054             password = downloader_params['password']
2055         elif downloader_params.get('usenetrc', False):
2056             try:
2057                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2058                 if info is not None:
2059                     useremail = info[0]
2060                     password = info[2]
2061                 else:
2062                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2063             except (IOError, netrc.NetrcParseError) as err:
2064                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2065                 return
2066
2067         if useremail is None:
2068             return
2069
2070         # Log in
2071         login_form = {
2072             'email': useremail,
2073             'pass': password,
2074             'login': 'Log+In'
2075             }
2076         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2077         try:
2078             self.report_login()
2079             login_results = compat_urllib_request.urlopen(request).read()
2080             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2081                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2082                 return
2083         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2084             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2085             return
2086
2087     def _real_extract(self, url):
2088         mobj = re.match(self._VALID_URL, url)
2089         if mobj is None:
2090             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2091             return
2092         video_id = mobj.group('ID')
2093
2094         # Get video webpage
2095         self.report_video_webpage_download(video_id)
2096         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2097         try:
2098             page = compat_urllib_request.urlopen(request)
2099             video_webpage = page.read()
2100         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2101             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2102             return
2103
2104         # Start extracting information
2105         self.report_information_extraction(video_id)
2106
2107         # Extract information
2108         video_info = self._parse_page(video_webpage)
2109
2110         # uploader
2111         if 'owner' not in video_info:
2112             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2113             return
2114         video_uploader = video_info['owner']
2115
2116         # title
2117         if 'title' not in video_info:
2118             self._downloader.trouble(u'ERROR: unable to extract video title')
2119             return
2120         video_title = video_info['title']
2121         video_title = video_title.decode('utf-8')
2122
2123         # thumbnail image
2124         if 'thumbnail' not in video_info:
2125             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2126             video_thumbnail = ''
2127         else:
2128             video_thumbnail = video_info['thumbnail']
2129
2130         # upload date
2131         upload_date = None
2132         if 'upload_date' in video_info:
2133             upload_time = video_info['upload_date']
2134             timetuple = email.utils.parsedate_tz(upload_time)
2135             if timetuple is not None:
2136                 try:
2137                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2138                 except:
2139                     pass
2140
2141         # description
2142         video_description = video_info.get('description', 'No description available.')
2143
2144         url_map = video_info['video_urls']
2145         if url_map:
2146             # Decide which formats to download
2147             req_format = self._downloader.params.get('format', None)
2148             format_limit = self._downloader.params.get('format_limit', None)
2149
2150             if format_limit is not None and format_limit in self._available_formats:
2151                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2152             else:
2153                 format_list = self._available_formats
2154             existing_formats = [x for x in format_list if x in url_map]
2155             if len(existing_formats) == 0:
2156                 self._downloader.trouble(u'ERROR: no known formats available for video')
2157                 return
2158             if req_format is None:
2159                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2160             elif req_format == 'worst':
2161                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2162             elif req_format == '-1':
2163                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2164             else:
2165                 # Specific format
2166                 if req_format not in url_map:
2167                     self._downloader.trouble(u'ERROR: requested format not available')
2168                     return
2169                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2170
2171         results = []
2172         for format_param, video_real_url in video_url_list:
2173             # Extension
2174             video_extension = self._video_extensions.get(format_param, 'mp4')
2175
2176             results.append({
2177                 'id':       video_id.decode('utf-8'),
2178                 'url':      video_real_url.decode('utf-8'),
2179                 'uploader': video_uploader.decode('utf-8'),
2180                 'upload_date':  upload_date,
2181                 'title':    video_title,
2182                 'ext':      video_extension.decode('utf-8'),
2183                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2184                 'thumbnail':    video_thumbnail.decode('utf-8'),
2185                 'description':  video_description.decode('utf-8'),
2186             })
2187         return results
2188
2189 class BlipTVIE(InfoExtractor):
2190     """Information extractor for blip.tv"""
2191
2192     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2193     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2194     IE_NAME = u'blip.tv'
2195
2196     def report_extraction(self, file_id):
2197         """Report information extraction."""
2198         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2199
2200     def report_direct_download(self, title):
2201         """Report information extraction."""
2202         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2203
2204     def _real_extract(self, url):
2205         mobj = re.match(self._VALID_URL, url)
2206         if mobj is None:
2207             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2208             return
2209
2210         if '?' in url:
2211             cchar = '&'
2212         else:
2213             cchar = '?'
2214         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2215         request = compat_urllib_request.Request(json_url)
2216         self.report_extraction(mobj.group(1))
2217         info = None
2218         try:
2219             urlh = compat_urllib_request.urlopen(request)
2220             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2221                 basename = url.split('/')[-1]
2222                 title,ext = os.path.splitext(basename)
2223                 title = title.decode('UTF-8')
2224                 ext = ext.replace('.', '')
2225                 self.report_direct_download(title)
2226                 info = {
2227                     'id': title,
2228                     'url': url,
2229                     'uploader': None,
2230                     'upload_date': None,
2231                     'title': title,
2232                     'ext': ext,
2233                     'urlhandle': urlh
2234                 }
2235         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2236             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2237             return
2238         if info is None: # Regular URL
2239             try:
2240                 json_code_bytes = urlh.read()
2241                 json_code = json_code_bytes.decode('utf-8')
2242             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2243                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2244                 return
2245
2246             try:
2247                 json_data = json.loads(json_code)
2248                 if 'Post' in json_data:
2249                     data = json_data['Post']
2250                 else:
2251                     data = json_data
2252
2253                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2254                 video_url = data['media']['url']
2255                 umobj = re.match(self._URL_EXT, video_url)
2256                 if umobj is None:
2257                     raise ValueError('Can not determine filename extension')
2258                 ext = umobj.group(1)
2259
2260                 info = {
2261                     'id': data['item_id'],
2262                     'url': video_url,
2263                     'uploader': data['display_name'],
2264                     'upload_date': upload_date,
2265                     'title': data['title'],
2266                     'ext': ext,
2267                     'format': data['media']['mimeType'],
2268                     'thumbnail': data['thumbnailUrl'],
2269                     'description': data['description'],
2270                     'player_url': data['embedUrl']
2271                 }
2272             except (ValueError,KeyError) as err:
2273                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2274                 return
2275
2276         std_headers['User-Agent'] = 'iTunes/10.6.1'
2277         return [info]
2278
2279
2280 class MyVideoIE(InfoExtractor):
2281     """Information Extractor for myvideo.de."""
2282
2283     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2284     IE_NAME = u'myvideo'
2285
2286     def __init__(self, downloader=None):
2287         InfoExtractor.__init__(self, downloader)
2288
2289     def report_extraction(self, video_id):
2290         """Report information extraction."""
2291         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2292
2293     def _real_extract(self,url):
2294         mobj = re.match(self._VALID_URL, url)
2295         if mobj is None:
2296             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2297             return
2298
2299         video_id = mobj.group(1)
2300
2301         # Get video webpage
2302         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2303         webpage = self._download_webpage(webpage_url, video_id)
2304
2305         self.report_extraction(video_id)
2306         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2307                  webpage)
2308         if mobj is None:
2309             self._downloader.trouble(u'ERROR: unable to extract media URL')
2310             return
2311         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2312
2313         mobj = re.search('<title>([^<]+)</title>', webpage)
2314         if mobj is None:
2315             self._downloader.trouble(u'ERROR: unable to extract title')
2316             return
2317
2318         video_title = mobj.group(1)
2319
2320         return [{
2321             'id':       video_id,
2322             'url':      video_url,
2323             'uploader': None,
2324             'upload_date':  None,
2325             'title':    video_title,
2326             'ext':      u'flv',
2327         }]
2328
2329 class ComedyCentralIE(InfoExtractor):
2330     """Information extractor for The Daily Show and Colbert Report """
2331
2332     # urls can be abbreviations like :thedailyshow or :colbert
2333     # urls for episodes like:
2334     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2335     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2336     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2337     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2338                       |(https?://)?(www\.)?
2339                           (?P<showname>thedailyshow|colbertnation)\.com/
2340                          (full-episodes/(?P<episode>.*)|
2341                           (?P<clip>
2342                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2343                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2344                      $"""
2345     IE_NAME = u'comedycentral'
2346
2347     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2348
2349     _video_extensions = {
2350         '3500': 'mp4',
2351         '2200': 'mp4',
2352         '1700': 'mp4',
2353         '1200': 'mp4',
2354         '750': 'mp4',
2355         '400': 'mp4',
2356     }
2357     _video_dimensions = {
2358         '3500': '1280x720',
2359         '2200': '960x540',
2360         '1700': '768x432',
2361         '1200': '640x360',
2362         '750': '512x288',
2363         '400': '384x216',
2364     }
2365
2366     def suitable(self, url):
2367         """Receives a URL and returns True if suitable for this IE."""
2368         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2369
2370     def report_extraction(self, episode_id):
2371         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2372
2373     def report_config_download(self, episode_id):
2374         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2375
2376     def report_index_download(self, episode_id):
2377         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2378
2379     def report_player_url(self, episode_id):
2380         self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2381
2382
2383     def _print_formats(self, formats):
2384         print('Available formats:')
2385         for x in formats:
2386             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2387
2388
2389     def _real_extract(self, url):
2390         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2391         if mobj is None:
2392             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2393             return
2394
2395         if mobj.group('shortname'):
2396             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2397                 url = u'http://www.thedailyshow.com/full-episodes/'
2398             else:
2399                 url = u'http://www.colbertnation.com/full-episodes/'
2400             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2401             assert mobj is not None
2402
2403         if mobj.group('clip'):
2404             if mobj.group('showname') == 'thedailyshow':
2405                 epTitle = mobj.group('tdstitle')
2406             else:
2407                 epTitle = mobj.group('cntitle')
2408             dlNewest = False
2409         else:
2410             dlNewest = not mobj.group('episode')
2411             if dlNewest:
2412                 epTitle = mobj.group('showname')
2413             else:
2414                 epTitle = mobj.group('episode')
2415
2416         req = compat_urllib_request.Request(url)
2417         self.report_extraction(epTitle)
2418         try:
2419             htmlHandle = compat_urllib_request.urlopen(req)
2420             html = htmlHandle.read()
2421         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2422             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2423             return
2424         if dlNewest:
2425             url = htmlHandle.geturl()
2426             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2427             if mobj is None:
2428                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2429                 return
2430             if mobj.group('episode') == '':
2431                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2432                 return
2433             epTitle = mobj.group('episode')
2434
2435         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2436
2437         if len(mMovieParams) == 0:
2438             # The Colbert Report embeds the information in a without
2439             # a URL prefix; so extract the alternate reference
2440             # and then add the URL prefix manually.
2441
2442             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2443             if len(altMovieParams) == 0:
2444                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2445                 return
2446             else:
2447                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2448
2449         playerUrl_raw = mMovieParams[0][0]
2450         self.report_player_url(epTitle)
2451         try:
2452             urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2453             playerUrl = urlHandle.geturl()
2454         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2455             self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2456             return
2457
2458         uri = mMovieParams[0][1]
2459         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2460         self.report_index_download(epTitle)
2461         try:
2462             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2463         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2464             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2465             return
2466
2467         results = []
2468
2469         idoc = xml.etree.ElementTree.fromstring(indexXml)
2470         itemEls = idoc.findall('.//item')
2471         for itemEl in itemEls:
2472             mediaId = itemEl.findall('./guid')[0].text
2473             shortMediaId = mediaId.split(':')[-1]
2474             showId = mediaId.split(':')[-2].replace('.com', '')
2475             officialTitle = itemEl.findall('./title')[0].text
2476             officialDate = itemEl.findall('./pubDate')[0].text
2477
2478             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2479                         compat_urllib_parse.urlencode({'uri': mediaId}))
2480             configReq = compat_urllib_request.Request(configUrl)
2481             self.report_config_download(epTitle)
2482             try:
2483                 configXml = compat_urllib_request.urlopen(configReq).read()
2484             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2486                 return
2487
2488             cdoc = xml.etree.ElementTree.fromstring(configXml)
2489             turls = []
2490             for rendition in cdoc.findall('.//rendition'):
2491                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2492                 turls.append(finfo)
2493
2494             if len(turls) == 0:
2495                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2496                 continue
2497
2498             if self._downloader.params.get('listformats', None):
2499                 self._print_formats([i[0] for i in turls])
2500                 return
2501
2502             # For now, just pick the highest bitrate
2503             format,video_url = turls[-1]
2504
2505             # Get the format arg from the arg stream
2506             req_format = self._downloader.params.get('format', None)
2507
2508             # Select format if we can find one
2509             for f,v in turls:
2510                 if f == req_format:
2511                     format, video_url = f, v
2512                     break
2513
2514             # Patch to download from alternative CDN, which does not
2515             # break on current RTMPDump builds
2516             broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2517             better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2518
2519             if video_url.startswith(broken_cdn):
2520                 video_url = video_url.replace(broken_cdn, better_cdn)
2521
2522             effTitle = showId + u'-' + epTitle
2523             info = {
2524                 'id': shortMediaId,
2525                 'url': video_url,
2526                 'uploader': showId,
2527                 'upload_date': officialDate,
2528                 'title': effTitle,
2529                 'ext': 'mp4',
2530                 'format': format,
2531                 'thumbnail': None,
2532                 'description': officialTitle,
2533                 'player_url': None #playerUrl
2534             }
2535
2536             results.append(info)
2537
2538         return results
2539
2540
2541 class EscapistIE(InfoExtractor):
2542     """Information extractor for The Escapist """
2543
2544     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2545     IE_NAME = u'escapist'
2546
2547     def report_extraction(self, showName):
2548         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2549
2550     def report_config_download(self, showName):
2551         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2552
2553     def _real_extract(self, url):
2554         mobj = re.match(self._VALID_URL, url)
2555         if mobj is None:
2556             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2557             return
2558         showName = mobj.group('showname')
2559         videoId = mobj.group('episode')
2560
2561         self.report_extraction(showName)
2562         try:
2563             webPage = compat_urllib_request.urlopen(url)
2564             webPageBytes = webPage.read()
2565             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2566             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2568             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2569             return
2570
2571         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2572         description = unescapeHTML(descMatch.group(1))
2573         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2574         imgUrl = unescapeHTML(imgMatch.group(1))
2575         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2576         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2577         configUrlMatch = re.search('config=(.*)$', playerUrl)
2578         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2579
2580         self.report_config_download(showName)
2581         try:
2582             configJSON = compat_urllib_request.urlopen(configUrl)
2583             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2584             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2585         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2586             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2587             return
2588
2589         # Technically, it's JavaScript, not JSON
2590         configJSON = configJSON.replace("'", '"')
2591
2592         try:
2593             config = json.loads(configJSON)
2594         except (ValueError,) as err:
2595             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2596             return
2597
2598         playlist = config['playlist']
2599         videoUrl = playlist[1]['url']
2600
2601         info = {
2602             'id': videoId,
2603             'url': videoUrl,
2604             'uploader': showName,
2605             'upload_date': None,
2606             'title': showName,
2607             'ext': 'flv',
2608             'thumbnail': imgUrl,
2609             'description': description,
2610             'player_url': playerUrl,
2611         }
2612
2613         return [info]
2614
2615
2616 class CollegeHumorIE(InfoExtractor):
2617     """Information extractor for collegehumor.com"""
2618
2619     _WORKING = False
2620     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2621     IE_NAME = u'collegehumor'
2622
2623     def report_manifest(self, video_id):
2624         """Report information extraction."""
2625         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2626
2627     def report_extraction(self, video_id):
2628         """Report information extraction."""
2629         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2630
2631     def _real_extract(self, url):
2632         mobj = re.match(self._VALID_URL, url)
2633         if mobj is None:
2634             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2635             return
2636         video_id = mobj.group('videoid')
2637
2638         info = {
2639             'id': video_id,
2640             'uploader': None,
2641             'upload_date': None,
2642         }
2643
2644         self.report_extraction(video_id)
2645         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2646         try:
2647             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2648         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2649             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2650             return
2651
2652         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2653         try:
2654             videoNode = mdoc.findall('./video')[0]
2655             info['description'] = videoNode.findall('./description')[0].text
2656             info['title'] = videoNode.findall('./caption')[0].text
2657             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2658             manifest_url = videoNode.findall('./file')[0].text
2659         except IndexError:
2660             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2661             return
2662
2663         manifest_url += '?hdcore=2.10.3'
2664         self.report_manifest(video_id)
2665         try:
2666             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2667         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2668             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2669             return
2670
2671         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2672         try:
2673             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2674             node_id = media_node.attrib['url']
2675             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2676         except IndexError as err:
2677             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2678             return
2679
2680         url_pr = compat_urllib_parse_urlparse(manifest_url)
2681         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2682
2683         info['url'] = url
2684         info['ext'] = 'f4f'
2685         return [info]
2686
2687
2688 class XVideosIE(InfoExtractor):
2689     """Information extractor for xvideos.com"""
2690
2691     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2692     IE_NAME = u'xvideos'
2693
2694     def report_extraction(self, video_id):
2695         """Report information extraction."""
2696         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2697
2698     def _real_extract(self, url):
2699         mobj = re.match(self._VALID_URL, url)
2700         if mobj is None:
2701             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2702             return
2703         video_id = mobj.group(1)
2704
2705         webpage = self._download_webpage(url, video_id)
2706
2707         self.report_extraction(video_id)
2708
2709
2710         # Extract video URL
2711         mobj = re.search(r'flv_url=(.+?)&', webpage)
2712         if mobj is None:
2713             self._downloader.trouble(u'ERROR: unable to extract video url')
2714             return
2715         video_url = compat_urllib_parse.unquote(mobj.group(1))
2716
2717
2718         # Extract title
2719         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2720         if mobj is None:
2721             self._downloader.trouble(u'ERROR: unable to extract video title')
2722             return
2723         video_title = mobj.group(1)
2724
2725
2726         # Extract video thumbnail
2727         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2728         if mobj is None:
2729             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2730             return
2731         video_thumbnail = mobj.group(0)
2732
2733         info = {
2734             'id': video_id,
2735             'url': video_url,
2736             'uploader': None,
2737             'upload_date': None,
2738             'title': video_title,
2739             'ext': 'flv',
2740             'thumbnail': video_thumbnail,
2741             'description': None,
2742         }
2743
2744         return [info]
2745
2746
2747 class SoundcloudIE(InfoExtractor):
2748     """Information extractor for soundcloud.com
2749        To access the media, the uid of the song and a stream token
2750        must be extracted from the page source and the script must make
2751        a request to media.soundcloud.com/crossdomain.xml. Then
2752        the media can be grabbed by requesting from an url composed
2753        of the stream token and uid
2754      """
2755
2756     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2757     IE_NAME = u'soundcloud'
2758
2759     def __init__(self, downloader=None):
2760         InfoExtractor.__init__(self, downloader)
2761
2762     def report_resolve(self, video_id):
2763         """Report information extraction."""
2764         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2765
2766     def report_extraction(self, video_id):
2767         """Report information extraction."""
2768         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2769
2770     def _real_extract(self, url):
2771         mobj = re.match(self._VALID_URL, url)
2772         if mobj is None:
2773             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2774             return
2775
2776         # extract uploader (which is in the url)
2777         uploader = mobj.group(1)
2778         # extract simple title (uploader + slug of song title)
2779         slug_title =  mobj.group(2)
2780         simple_title = uploader + u'-' + slug_title
2781
2782         self.report_resolve('%s/%s' % (uploader, slug_title))
2783
2784         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2785         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2786         request = compat_urllib_request.Request(resolv_url)
2787         try:
2788             info_json_bytes = compat_urllib_request.urlopen(request).read()
2789             info_json = info_json_bytes.decode('utf-8')
2790         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2791             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2792             return
2793
2794         info = json.loads(info_json)
2795         video_id = info['id']
2796         self.report_extraction('%s/%s' % (uploader, slug_title))
2797
2798         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2799         request = compat_urllib_request.Request(streams_url)
2800         try:
2801             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2802             stream_json = stream_json_bytes.decode('utf-8')
2803         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2804             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2805             return
2806
2807         streams = json.loads(stream_json)
2808         mediaURL = streams['http_mp3_128_url']
2809
2810         return [{
2811             'id':       info['id'],
2812             'url':      mediaURL,
2813             'uploader': info['user']['username'],
2814             'upload_date':  info['created_at'],
2815             'title':    info['title'],
2816             'ext':      u'mp3',
2817             'description': info['description'],
2818         }]
2819
2820
2821 class InfoQIE(InfoExtractor):
2822     """Information extractor for infoq.com"""
2823
2824     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2825     IE_NAME = u'infoq'
2826
2827     def report_webpage(self, video_id):
2828         """Report information extraction."""
2829         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2830
2831     def report_extraction(self, video_id):
2832         """Report information extraction."""
2833         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2834
2835     def _real_extract(self, url):
2836         mobj = re.match(self._VALID_URL, url)
2837         if mobj is None:
2838             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2839             return
2840
2841         self.report_webpage(url)
2842
2843         request = compat_urllib_request.Request(url)
2844         try:
2845             webpage = compat_urllib_request.urlopen(request).read()
2846         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2847             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2848             return
2849
2850         self.report_extraction(url)
2851
2852
2853         # Extract video URL
2854         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2855         if mobj is None:
2856             self._downloader.trouble(u'ERROR: unable to extract video url')
2857             return
2858         video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2859
2860
2861         # Extract title
2862         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2863         if mobj is None:
2864             self._downloader.trouble(u'ERROR: unable to extract video title')
2865             return
2866         video_title = mobj.group(1).decode('utf-8')
2867
2868         # Extract description
2869         video_description = u'No description available.'
2870         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2871         if mobj is not None:
2872             video_description = mobj.group(1).decode('utf-8')
2873
2874         video_filename = video_url.split('/')[-1]
2875         video_id, extension = video_filename.split('.')
2876
2877         info = {
2878             'id': video_id,
2879             'url': video_url,
2880             'uploader': None,
2881             'upload_date': None,
2882             'title': video_title,
2883             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2884             'thumbnail': None,
2885             'description': video_description,
2886         }
2887
2888         return [info]
2889
2890 class MixcloudIE(InfoExtractor):
2891     """Information extractor for www.mixcloud.com"""
2892
2893     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2894     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2895     IE_NAME = u'mixcloud'
2896
2897     def __init__(self, downloader=None):
2898         InfoExtractor.__init__(self, downloader)
2899
2900     def report_download_json(self, file_id):
2901         """Report JSON download."""
2902         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2903
2904     def report_extraction(self, file_id):
2905         """Report information extraction."""
2906         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2907
2908     def get_urls(self, jsonData, fmt, bitrate='best'):
2909         """Get urls from 'audio_formats' section in json"""
2910         file_url = None
2911         try:
2912             bitrate_list = jsonData[fmt]
2913             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2914                 bitrate = max(bitrate_list) # select highest
2915
2916             url_list = jsonData[fmt][bitrate]
2917         except TypeError: # we have no bitrate info.
2918             url_list = jsonData[fmt]
2919         return url_list
2920
2921     def check_urls(self, url_list):
2922         """Returns 1st active url from list"""
2923         for url in url_list:
2924             try:
2925                 compat_urllib_request.urlopen(url)
2926                 return url
2927             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2928                 url = None
2929
2930         return None
2931
2932     def _print_formats(self, formats):
2933         print('Available formats:')
2934         for fmt in formats.keys():
2935             for b in formats[fmt]:
2936                 try:
2937                     ext = formats[fmt][b][0]
2938                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2939                 except TypeError: # we have no bitrate info
2940                     ext = formats[fmt][0]
2941                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2942                     break
2943
2944     def _real_extract(self, url):
2945         mobj = re.match(self._VALID_URL, url)
2946         if mobj is None:
2947             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2948             return
2949         # extract uploader & filename from url
2950         uploader = mobj.group(1).decode('utf-8')
2951         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2952
2953         # construct API request
2954         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2955         # retrieve .json file with links to files
2956         request = compat_urllib_request.Request(file_url)
2957         try:
2958             self.report_download_json(file_url)
2959             jsonData = compat_urllib_request.urlopen(request).read()
2960         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2961             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2962             return
2963
2964         # parse JSON
2965         json_data = json.loads(jsonData)
2966         player_url = json_data['player_swf_url']
2967         formats = dict(json_data['audio_formats'])
2968
2969         req_format = self._downloader.params.get('format', None)
2970         bitrate = None
2971
2972         if self._downloader.params.get('listformats', None):
2973             self._print_formats(formats)
2974             return
2975
2976         if req_format is None or req_format == 'best':
2977             for format_param in formats.keys():
2978                 url_list = self.get_urls(formats, format_param)
2979                 # check urls
2980                 file_url = self.check_urls(url_list)
2981                 if file_url is not None:
2982                     break # got it!
2983         else:
2984             if req_format not in formats:
2985                 self._downloader.trouble(u'ERROR: format is not available')
2986                 return
2987
2988             url_list = self.get_urls(formats, req_format)
2989             file_url = self.check_urls(url_list)
2990             format_param = req_format
2991
2992         return [{
2993             'id': file_id.decode('utf-8'),
2994             'url': file_url.decode('utf-8'),
2995             'uploader': uploader.decode('utf-8'),
2996             'upload_date': None,
2997             'title': json_data['name'],
2998             'ext': file_url.split('.')[-1].decode('utf-8'),
2999             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3000             'thumbnail': json_data['thumbnail_url'],
3001             'description': json_data['description'],
3002             'player_url': player_url.decode('utf-8'),
3003         }]
3004
3005 class StanfordOpenClassroomIE(InfoExtractor):
3006     """Information extractor for Stanford's Open ClassRoom"""
3007
3008     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3009     IE_NAME = u'stanfordoc'
3010
3011     def report_download_webpage(self, objid):
3012         """Report information extraction."""
3013         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3014
3015     def report_extraction(self, video_id):
3016         """Report information extraction."""
3017         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3018
3019     def _real_extract(self, url):
3020         mobj = re.match(self._VALID_URL, url)
3021         if mobj is None:
3022             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3023             return
3024
3025         if mobj.group('course') and mobj.group('video'): # A specific video
3026             course = mobj.group('course')
3027             video = mobj.group('video')
3028             info = {
3029                 'id': course + '_' + video,
3030                 'uploader': None,
3031                 'upload_date': None,
3032             }
3033
3034             self.report_extraction(info['id'])
3035             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3036             xmlUrl = baseUrl + video + '.xml'
3037             try:
3038                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3039             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3040                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3041                 return
3042             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3043             try:
3044                 info['title'] = mdoc.findall('./title')[0].text
3045                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3046             except IndexError:
3047                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3048                 return
3049             info['ext'] = info['url'].rpartition('.')[2]
3050             return [info]
3051         elif mobj.group('course'): # A course page
3052             course = mobj.group('course')
3053             info = {
3054                 'id': course,
3055                 'type': 'playlist',
3056                 'uploader': None,
3057                 'upload_date': None,
3058             }
3059
3060             self.report_download_webpage(info['id'])
3061             try:
3062                 coursepage = compat_urllib_request.urlopen(url).read()
3063             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3064                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3065                 return
3066
3067             m = re.search('<h1>([^<]+)</h1>', coursepage)
3068             if m:
3069                 info['title'] = unescapeHTML(m.group(1))
3070             else:
3071                 info['title'] = info['id']
3072
3073             m = re.search('<description>([^<]+)</description>', coursepage)
3074             if m:
3075                 info['description'] = unescapeHTML(m.group(1))
3076
3077             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3078             info['list'] = [
3079                 {
3080                     'type': 'reference',
3081                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3082                 }
3083                     for vpage in links]
3084             results = []
3085             for entry in info['list']:
3086                 assert entry['type'] == 'reference'
3087                 results += self.extract(entry['url'])
3088             return results
3089
3090         else: # Root page
3091             info = {
3092                 'id': 'Stanford OpenClassroom',
3093                 'type': 'playlist',
3094                 'uploader': None,
3095                 'upload_date': None,
3096             }
3097
3098             self.report_download_webpage(info['id'])
3099             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3100             try:
3101                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3102             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3103                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3104                 return
3105
3106             info['title'] = info['id']
3107
3108             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3109             info['list'] = [
3110                 {
3111                     'type': 'reference',
3112                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3113                 }
3114                     for cpage in links]
3115
3116             results = []
3117             for entry in info['list']:
3118                 assert entry['type'] == 'reference'
3119                 results += self.extract(entry['url'])
3120             return results
3121
3122 class MTVIE(InfoExtractor):
3123     """Information extractor for MTV.com"""
3124
3125     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3126     IE_NAME = u'mtv'
3127
3128     def report_extraction(self, video_id):
3129         """Report information extraction."""
3130         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3131
3132     def _real_extract(self, url):
3133         mobj = re.match(self._VALID_URL, url)
3134         if mobj is None:
3135             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3136             return
3137         if not mobj.group('proto'):
3138             url = 'http://' + url
3139         video_id = mobj.group('videoid')
3140
3141         webpage = self._download_webpage(url, video_id)
3142
3143         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3144         if mobj is None:
3145             self._downloader.trouble(u'ERROR: unable to extract song name')
3146             return
3147         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3148         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3149         if mobj is None:
3150             self._downloader.trouble(u'ERROR: unable to extract performer')
3151             return
3152         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3153         video_title = performer + ' - ' + song_name
3154
3155         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3156         if mobj is None:
3157             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3158             return
3159         mtvn_uri = mobj.group(1)
3160
3161         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3162         if mobj is None:
3163             self._downloader.trouble(u'ERROR: unable to extract content id')
3164             return
3165         content_id = mobj.group(1)
3166
3167         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3168         self.report_extraction(video_id)
3169         request = compat_urllib_request.Request(videogen_url)
3170         try:
3171             metadataXml = compat_urllib_request.urlopen(request).read()
3172         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3173             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3174             return
3175
3176         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3177         renditions = mdoc.findall('.//rendition')
3178
3179         # For now, always pick the highest quality.
3180         rendition = renditions[-1]
3181
3182         try:
3183             _,_,ext = rendition.attrib['type'].partition('/')
3184             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3185             video_url = rendition.find('./src').text
3186         except KeyError:
3187             self._downloader.trouble('Invalid rendition field.')
3188             return
3189
3190         info = {
3191             'id': video_id,
3192             'url': video_url,
3193             'uploader': performer,
3194             'upload_date': None,
3195             'title': video_title,
3196             'ext': ext,
3197             'format': format,
3198         }
3199
3200         return [info]
3201
3202
3203 class YoukuIE(InfoExtractor):
3204     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3205
3206     def report_download_webpage(self, file_id):
3207         """Report webpage download."""
3208         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3209
3210     def report_extraction(self, file_id):
3211         """Report information extraction."""
3212         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3213
3214     def _gen_sid(self):
3215         nowTime = int(time.time() * 1000)
3216         random1 = random.randint(1000,1998)
3217         random2 = random.randint(1000,9999)
3218
3219         return "%d%d%d" %(nowTime,random1,random2)
3220
3221     def _get_file_ID_mix_string(self, seed):
3222         mixed = []
3223         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3224         seed = float(seed)
3225         for i in range(len(source)):
3226             seed  =  (seed * 211 + 30031 ) % 65536
3227             index  =  math.floor(seed / 65536 * len(source) )
3228             mixed.append(source[int(index)])
3229             source.remove(source[int(index)])
3230         #return ''.join(mixed)
3231         return mixed
3232
3233     def _get_file_id(self, fileId, seed):
3234         mixed = self._get_file_ID_mix_string(seed)
3235         ids = fileId.split('*')
3236         realId = []
3237         for ch in ids:
3238             if ch:
3239                 realId.append(mixed[int(ch)])
3240         return ''.join(realId)
3241
3242     def _real_extract(self, url):
3243         mobj = re.match(self._VALID_URL, url)
3244         if mobj is None:
3245             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3246             return
3247         video_id = mobj.group('ID')
3248
3249         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3250
3251         request = compat_urllib_request.Request(info_url, None, std_headers)
3252         try:
3253             self.report_download_webpage(video_id)
3254             jsondata = compat_urllib_request.urlopen(request).read()
3255         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3256             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3257             return
3258
3259         self.report_extraction(video_id)
3260         try:
3261             jsonstr = jsondata.decode('utf-8')
3262             config = json.loads(jsonstr)
3263
3264             video_title =  config['data'][0]['title']
3265             seed = config['data'][0]['seed']
3266
3267             format = self._downloader.params.get('format', None)
3268             supported_format = list(config['data'][0]['streamfileids'].keys())
3269
3270             if format is None or format == 'best':
3271                 if 'hd2' in supported_format:
3272                     format = 'hd2'
3273                 else:
3274                     format = 'flv'
3275                 ext = u'flv'
3276             elif format == 'worst':
3277                 format = 'mp4'
3278                 ext = u'mp4'
3279             else:
3280                 format = 'flv'
3281                 ext = u'flv'
3282
3283
3284             fileid = config['data'][0]['streamfileids'][format]
3285             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3286         except (UnicodeDecodeError, ValueError, KeyError):
3287             self._downloader.trouble(u'ERROR: unable to extract info section')
3288             return
3289
3290         files_info=[]
3291         sid = self._gen_sid()
3292         fileid = self._get_file_id(fileid, seed)
3293
3294         #column 8,9 of fileid represent the segment number
3295         #fileid[7:9] should be changed
3296         for index, key in enumerate(keys):
3297
3298             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3299             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3300
3301             info = {
3302                 'id': '%s_part%02d' % (video_id, index),
3303                 'url': download_url,
3304                 'uploader': None,
3305                 'upload_date': None,
3306                 'title': video_title,
3307                 'ext': ext,
3308             }
3309             files_info.append(info)
3310
3311         return files_info
3312
3313
3314 class XNXXIE(InfoExtractor):
3315     """Information extractor for xnxx.com"""
3316
3317     _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3318     IE_NAME = u'xnxx'
3319     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3320     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3321     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3322
3323     def report_webpage(self, video_id):
3324         """Report information extraction"""
3325         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3326
3327     def report_extraction(self, video_id):
3328         """Report information extraction"""
3329         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3330
3331     def _real_extract(self, url):
3332         mobj = re.match(self._VALID_URL, url)
3333         if mobj is None:
3334             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3335             return
3336         video_id = mobj.group(1)
3337
3338         self.report_webpage(video_id)
3339
3340         # Get webpage content
3341         try:
3342             webpage_bytes = compat_urllib_request.urlopen(url).read()
3343             webpage = webpage_bytes.decode('utf-8')
3344         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3345             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3346             return
3347
3348         result = re.search(self.VIDEO_URL_RE, webpage)
3349         if result is None:
3350             self._downloader.trouble(u'ERROR: unable to extract video url')
3351             return
3352         video_url = compat_urllib_parse.unquote(result.group(1))
3353
3354         result = re.search(self.VIDEO_TITLE_RE, webpage)
3355         if result is None:
3356             self._downloader.trouble(u'ERROR: unable to extract video title')
3357             return
3358         video_title = result.group(1)
3359
3360         result = re.search(self.VIDEO_THUMB_RE, webpage)
3361         if result is None:
3362             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3363             return
3364         video_thumbnail = result.group(1)
3365
3366         return [{
3367             'id': video_id,
3368             'url': video_url,
3369             'uploader': None,
3370             'upload_date': None,
3371             'title': video_title,
3372             'ext': 'flv',
3373             'thumbnail': video_thumbnail,
3374             'description': None,
3375         }]
3376
3377
3378 class GooglePlusIE(InfoExtractor):
3379     """Information extractor for plus.google.com."""
3380
3381     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3382     IE_NAME = u'plus.google'
3383
3384     def __init__(self, downloader=None):
3385         InfoExtractor.__init__(self, downloader)
3386
3387     def report_extract_entry(self, url):
3388         """Report downloading extry"""
3389         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3390
3391     def report_date(self, upload_date):
3392         """Report downloading extry"""
3393         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3394
3395     def report_uploader(self, uploader):
3396         """Report downloading extry"""
3397         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3398
3399     def report_title(self, video_title):
3400         """Report downloading extry"""
3401         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3402
3403     def report_extract_vid_page(self, video_page):
3404         """Report information extraction."""
3405         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3406
3407     def _real_extract(self, url):
3408         # Extract id from URL
3409         mobj = re.match(self._VALID_URL, url)
3410         if mobj is None:
3411             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3412             return
3413
3414         post_url = mobj.group(0)
3415         video_id = mobj.group(1)
3416
3417         video_extension = 'flv'
3418
3419         # Step 1, Retrieve post webpage to extract further information
3420         self.report_extract_entry(post_url)
3421         request = compat_urllib_request.Request(post_url)
3422         try:
3423             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3424         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3425             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3426             return
3427
3428         # Extract update date
3429         upload_date = None
3430         pattern = 'title="Timestamp">(.*?)</a>'
3431         mobj = re.search(pattern, webpage)
3432         if mobj:
3433             upload_date = mobj.group(1)
3434             # Convert timestring to a format suitable for filename
3435             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3436             upload_date = upload_date.strftime('%Y%m%d')
3437         self.report_date(upload_date)
3438
3439         # Extract uploader
3440         uploader = None
3441         pattern = r'rel\="author".*?>(.*?)</a>'
3442         mobj = re.search(pattern, webpage)
3443         if mobj:
3444             uploader = mobj.group(1)
3445         self.report_uploader(uploader)
3446
3447         # Extract title
3448         # Get the first line for title
3449         video_title = u'NA'
3450         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3451         mobj = re.search(pattern, webpage)
3452         if mobj:
3453             video_title = mobj.group(1)
3454         self.report_title(video_title)
3455
3456         # Step 2, Stimulate clicking the image box to launch video
3457         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3458         mobj = re.search(pattern, webpage)
3459         if mobj is None:
3460             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3461
3462         video_page = mobj.group(1)
3463         request = compat_urllib_request.Request(video_page)
3464         try:
3465             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3466         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3467             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3468             return
3469         self.report_extract_vid_page(video_page)
3470
3471
3472         # Extract video links on video page
3473         """Extract video links of all sizes"""
3474         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3475         mobj = re.findall(pattern, webpage)
3476         if len(mobj) == 0:
3477             self._downloader.trouble(u'ERROR: unable to extract video links')
3478
3479         # Sort in resolution
3480         links = sorted(mobj)
3481
3482         # Choose the lowest of the sort, i.e. highest resolution
3483         video_url = links[-1]
3484         # Only get the url. The resolution part in the tuple has no use anymore
3485         video_url = video_url[-1]
3486         # Treat escaped \u0026 style hex
3487         try:
3488             video_url = video_url.decode("unicode_escape")
3489         except AttributeError: # Python 3
3490             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3491
3492
3493         return [{
3494             'id':       video_id,
3495             'url':      video_url,
3496             'uploader': uploader,
3497             'upload_date':  upload_date,
3498             'title':    video_title,
3499             'ext':      video_extension,
3500         }]
3501
3502 class NBAIE(InfoExtractor):
3503     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3504     IE_NAME = u'nba'
3505
3506     def _real_extract(self, url):
3507         mobj = re.match(self._VALID_URL, url)
3508         if mobj is None:
3509             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3510             return
3511
3512         video_id = mobj.group(1)
3513         if video_id.endswith('/index.html'):
3514             video_id = video_id[:-len('/index.html')]
3515
3516         webpage = self._download_webpage(url, video_id)
3517
3518         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3519         def _findProp(rexp, default=None):
3520             m = re.search(rexp, webpage)
3521             if m:
3522                 return unescapeHTML(m.group(1))
3523             else:
3524                 return default
3525
3526         shortened_video_id = video_id.rpartition('/')[2]
3527         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3528         info = {
3529             'id': shortened_video_id,
3530             'url': video_url,
3531             'ext': 'mp4',
3532             'title': title,
3533             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3534             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3535         }
3536         return [info]
3537
3538 class JustinTVIE(InfoExtractor):
3539     """Information extractor for justin.tv and twitch.tv"""
3540     # TODO: One broadcast may be split into multiple videos. The key
3541     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3542     # starts at 1 and increases. Can we treat all parts as one video?
3543
3544     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3545         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3546     _JUSTIN_PAGE_LIMIT = 100
3547     IE_NAME = u'justin.tv'
3548
3549     def report_extraction(self, file_id):
3550         """Report information extraction."""
3551         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3552
3553     def report_download_page(self, channel, offset):
3554         """Report attempt to download a single page of videos."""
3555         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3556                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3557
3558     # Return count of items, list of *valid* items
3559     def _parse_page(self, url):
3560         try:
3561             urlh = compat_urllib_request.urlopen(url)
3562             webpage_bytes = urlh.read()
3563             webpage = webpage_bytes.decode('utf-8', 'ignore')
3564         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3565             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3566             return
3567
3568         response = json.loads(webpage)
3569         info = []
3570         for clip in response:
3571             video_url = clip['video_file_url']
3572             if video_url:
3573                 video_extension = os.path.splitext(video_url)[1][1:]
3574                 video_date = re.sub('-', '', clip['created_on'][:10])
3575                 info.append({
3576                     'id': clip['id'],
3577                     'url': video_url,
3578                     'title': clip['title'],
3579                     'uploader': clip.get('user_id', clip.get('channel_id')),
3580                     'upload_date': video_date,
3581                     'ext': video_extension,
3582                 })
3583         return (len(response), info)
3584
3585     def _real_extract(self, url):
3586         mobj = re.match(self._VALID_URL, url)
3587         if mobj is None:
3588             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589             return
3590
3591         api = 'http://api.justin.tv'
3592         video_id = mobj.group(mobj.lastindex)
3593         paged = False
3594         if mobj.lastindex == 1:
3595             paged = True
3596             api += '/channel/archives/%s.json'
3597         else:
3598             api += '/clip/show/%s.json'
3599         api = api % (video_id,)
3600
3601         self.report_extraction(video_id)
3602
3603         info = []
3604         offset = 0
3605         limit = self._JUSTIN_PAGE_LIMIT
3606         while True:
3607             if paged:
3608                 self.report_download_page(video_id, offset)
3609             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3610             page_count, page_info = self._parse_page(page_url)
3611             info.extend(page_info)
3612             if not paged or page_count != limit:
3613                 break
3614             offset += limit
3615         return info
3616
3617 class FunnyOrDieIE(InfoExtractor):
3618     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3619
3620     def _real_extract(self, url):
3621         mobj = re.match(self._VALID_URL, url)
3622         if mobj is None:
3623             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3624             return
3625
3626         video_id = mobj.group('id')
3627         webpage = self._download_webpage(url, video_id)
3628
3629         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3630         if not m:
3631             self._downloader.trouble(u'ERROR: unable to find video information')
3632         video_url = unescapeHTML(m.group('url'))
3633
3634         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3635         if not m:
3636             self._downloader.trouble(u'Cannot find video title')
3637         title = unescapeHTML(m.group('title'))
3638
3639         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3640         if m:
3641             desc = unescapeHTML(m.group('desc'))
3642         else:
3643             desc = None
3644
3645         info = {
3646             'id': video_id,
3647             'url': video_url,
3648             'ext': 'mp4',
3649             'title': title,
3650             'description': desc,
3651         }
3652         return [info]
3653
3654 class TweetReelIE(InfoExtractor):
3655     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3656
3657     def _real_extract(self, url):
3658         mobj = re.match(self._VALID_URL, url)
3659         if mobj is None:
3660             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3661             return
3662
3663         video_id = mobj.group('id')
3664         webpage = self._download_webpage(url, video_id)
3665
3666         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3667         if not m:
3668             self._downloader.trouble(u'ERROR: Cannot find status ID')
3669         status_id = m.group(1)
3670
3671         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3672         if not m:
3673             self._downloader.trouble(u'WARNING: Cannot find description')
3674         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3675
3676         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3677         if not m:
3678             self._downloader.trouble(u'ERROR: Cannot find uploader')
3679         uploader = unescapeHTML(m.group('uploader'))
3680         uploader_id = unescapeHTML(m.group('uploader_id'))
3681
3682         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3683         if not m:
3684             self._downloader.trouble(u'ERROR: Cannot find upload date')
3685         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3686
3687         title = desc
3688         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3689
3690         info = {
3691             'id': video_id,
3692             'url': video_url,
3693             'ext': 'mov',
3694             'title': title,
3695             'description': desc,
3696             'uploader': uploader,
3697             'uploader_id': uploader_id,
3698             'internal_id': status_id,
3699             'upload_date': upload_date
3700         }
3701         return [info]
3702
3703 class SteamIE(InfoExtractor):
3704     _VALID_URL = r"""http://store.steampowered.com/
3705                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3706                 (?P<gameID>\d+)/?
3707                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3708                 """
3709
3710     def suitable(self, url):
3711         """Receives a URL and returns True if suitable for this IE."""
3712         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3713
3714     def _real_extract(self, url):
3715         m = re.match(self._VALID_URL, url, re.VERBOSE)
3716         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3717         gameID = m.group('gameID')
3718         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3719         webpage = self._download_webpage(videourl, gameID)
3720         mweb = re.finditer(urlRE, webpage)
3721         namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3722         titles = list(re.finditer(namesRE, webpage))
3723         videos = []
3724         for vid,vtitle in zip(mweb,titles):
3725             video_id = vid.group('videoID')
3726             title = vtitle.group('videoName')
3727             video_url = vid.group('videoURL')
3728             if not video_url:
3729                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3730             info = {
3731                 'id':video_id,
3732                 'url':video_url,
3733                 'ext': 'flv',
3734                 'title': title
3735                   }
3736             videos.append(info)
3737         return videos
3738
3739 class UstreamIE(InfoExtractor):
3740     _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3741     IE_NAME = u'ustream'
3742
3743     def _real_extract(self, url):
3744         m = re.match(self._VALID_URL, url)
3745         video_id = m.group('videoID')
3746         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3747         webpage = self._download_webpage(url, video_id)
3748         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3749         title = m.group('title')
3750         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3751         uploader = m.group('uploader')
3752         info = {
3753                 'id':video_id,
3754                 'url':video_url,
3755                 'ext': 'flv',
3756                 'title': title,
3757                 'uploader': uploader
3758                   }
3759         return [info]
3760
3761
3762 def gen_extractors():
3763     """ Return a list of an instance of every supported extractor.
3764     The order does matter; the first extractor matched is the one handling the URL.
3765     """
3766     return [
3767         YoutubePlaylistIE(),
3768         YoutubeChannelIE(),
3769         YoutubeUserIE(),
3770         YoutubeSearchIE(),
3771         YoutubeIE(),
3772         MetacafeIE(),
3773         DailymotionIE(),
3774         GoogleSearchIE(),
3775         PhotobucketIE(),
3776         YahooIE(),
3777         YahooSearchIE(),
3778         DepositFilesIE(),
3779         FacebookIE(),
3780         BlipTVUserIE(),
3781         BlipTVIE(),
3782         VimeoIE(),
3783         MyVideoIE(),
3784         ComedyCentralIE(),
3785         EscapistIE(),
3786         CollegeHumorIE(),
3787         XVideosIE(),
3788         SoundcloudIE(),
3789         InfoQIE(),
3790         MixcloudIE(),
3791         StanfordOpenClassroomIE(),
3792         MTVIE(),
3793         YoukuIE(),
3794         XNXXIE(),
3795         GooglePlusIE(),
3796         ArteTvIE(),
3797         NBAIE(),
3798         JustinTVIE(),
3799         FunnyOrDieIE(),
3800         TweetReelIE(),
3801         SteamIE(),
3802         UstreamIE(),
3803         GenericIE()
3804     ]
3805
3806