Remove superfluous IE names
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) turns a URL into one or more dictionaries
    describing the video(s) behind it: the real media URL, the title, the
    uploader, and so on. Those dictionaries are handed to the FileDownloader,
    which takes care of actually fetching the media (or whatever else the
    user asked for).

    Mandatory keys in every dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Concrete subclasses override _real_initialize() and _real_extract() and
    provide a _VALID_URL regular expression; they should normally also be
    registered in the list of extractors. _real_extract() must return a
    *list* of dictionaries shaped as described above. Broken extractors
    should set _WORKING to False so users are warned and tests are skipped.
    """

    # Class-level defaults; _ready is re-initialized per instance in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports results and errors to."""
        self._downloader = downloader

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this IE is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time initialization (authentication, etc.), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name, e.g. FooIE -> "Foo".
        return self.__class__.__name__[:-2]
107
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matches full watch/embed URLs, short youtu.be links,
    # anchor-redirect (#/) forms, and also a bare 11-ish character video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything not listed defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions; written as height x width (e.g. '22' is 720p).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # NOTE(review): shadows the IE_NAME property from InfoExtractor; the
    # derived name would be u'Youtube' (different case), so this assignment
    # is not strictly redundant.
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation because _VALID_URL is written
        # with whitespace and comments and therefore needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into the contents of an .srt file."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the dur attribute is missing
            start = float(start)
            end = start + float(dur)
            # Render as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions and return (error_message, srt_contents).

        Exactly one element of the pair is None: on success the error is
        None; on any failure the srt contents are None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a lang_code -> track name mapping from the track list XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit --subtitles-lang, then English,
        # then whichever track happens to come first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one line per available format: itag, extension, dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language to English and, if credentials exist, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (hl=en) — presumably so the later page-scraping
        # regexes see English markup; failure aborts initialization.
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from a URL, or None (after reporting) if invalid."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 of _VALID_URL is the bare video ID (group 1 is the URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download watch page + get_video_info and build the result dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JSON-style backslash escaping in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try successive el= values until a response
        # contains a 'token'; the empty string is the last resort.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            # NOTE(review): a network error on any single el_type aborts the
            # whole extraction instead of trying the remaining variants.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (non-fatal: stays None if the markup changed)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats.
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # NOTE(review): bare except silently skips unparsable formats
                # (ValueError would be the narrower catch); if none match,
                # upload_date is left as the raw whitespace-joined text.
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions (only when --write-srt was requested)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams carry no itag; format_param is None downstream.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of url-encoded dicts.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every surviving entry also carries 'sig';
            # an entry without it would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            # format_limit caps quality by truncating the preference list.
            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # One result dict per selected format; shared metadata is reused.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
525
526
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id; group 2 is the display-title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # NOTE(review): shadows the IE_NAME property from InfoExtractor (which
    # would derive u'Metacafe', different case).
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST past the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Scrape the watch page for the media URL, title and uploader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: ids of the form yt-XXXX are
        # delegated to the downloader (and thus to the YouTube extractor).
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        # NOTE(review): the response is never decoded and the match groups
        # below are .decode()d — this assumes Python 2 byte strings; under
        # Python 3 these str regexes would not match bytes. Confirm target
        # runtime before touching this.
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct mediaURL found in the page.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: the player's flashvars carry a url-encoded mediaData
            # blob containing the media URL and its access key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            # Undo JSON-style escaped slashes in the URL.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
652
653
class DailymotionIE(InfoExtractor):
    """Information extractor for dailymotion.com video pages."""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, title, uploader, date and media URL from a video page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the URL path component before any '_' (simplified
        # title) suffix or query string.
        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page; the cookie turns the family filter off so that
        # restricted videos are served too.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URLs live in the page's flashvars assignment.
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best available quality, in descending order of preference.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Try the regular owner markup first, then the "official user" markup.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
750
751
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Photobucket page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # The id is the .flv filename carried in the 'current' query parameter.
        video_id = m.group(1)
        video_extension = 'flv'

        # Download the page the URL points at.
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is the 'file' parameter of the video_src <link> tag.
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader both come from the <title> tag.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
815
816
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info.

        Non-extractable URLs (e.g. /network/ links) are first rewritten to
        the canonical English /watch/ form and re-extracted once with
        new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the 'people'/'profile' path segment; the uploader
        # name itself is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
958
959
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the preferred media URL for a Vimeo video."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker was not found in the page;
            # ValueError: the extracted snippet is not valid JSON.
            # (Was a bare 'except:', which also swallowed KeyboardInterrupt
            # and SystemExit.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in a meta tag, normalized to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1074
1075
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html' and get special handling.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urllib raises ValueError for malformed URLs (e.g. unknown scheme).
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and collect the groups listed in
        matchTuples -- an iterable of (group_index, key, error_message)
        triples -- into a dict. Returns None if the page does not match or
        a required group is empty.

        NOTE(review): if fetch_webpage failed, page is None and re.search
        will raise TypeError rather than report trouble -- confirm intended.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp stream.

        NOTE(review): video_url is computed at the end but never returned
        or stored, so live-stream extraction currently produces nothing --
        confirm whether this is intentional (live downloads unsupported).
        """
        # Language code is a fixed path component of the live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through its two XML indirections and
        return the info dict for the hd-quality stream."""
        # Language code is a fixed path component of the +7 URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams return nothing (see extractLiveStream note above).
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1210
1211
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report redirect following."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Heuristically extract a direct media URL from an arbitrary page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-pasted message: this branch fails on the uploader,
            # not the title.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1356
1357
1358 class YoutubeSearchIE(InfoExtractor):
1359     """Information Extractor for YouTube search queries."""
1360     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1361     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1362     _max_youtube_results = 1000
1363     IE_NAME = u'youtube:search'
1364
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
1367
1368     def report_download_page(self, query, pagenum):
1369         """Report attempt to download search page with given number."""
1370         query = query.decode(preferredencoding())
1371         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1372
1373     def _real_extract(self, query):
1374         mobj = re.match(self._VALID_URL, query)
1375         if mobj is None:
1376             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1377             return
1378
1379         prefix, query = query.split(':')
1380         prefix = prefix[8:]
1381         query = query.encode('utf-8')
1382         if prefix == '':
1383             self._download_n_results(query, 1)
1384             return
1385         elif prefix == 'all':
1386             self._download_n_results(query, self._max_youtube_results)
1387             return
1388         else:
1389             try:
1390                 n = int(prefix)
1391                 if n <= 0:
1392                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1393                     return
1394                 elif n > self._max_youtube_results:
1395                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1396                     n = self._max_youtube_results
1397                 self._download_n_results(query, n)
1398                 return
1399             except ValueError: # parsing prefix as integer fails
1400                 self._download_n_results(query, 1)
1401                 return
1402
1403     def _download_n_results(self, query, n):
1404         """Downloads a specified number of results for a query"""
1405
1406         video_ids = []
1407         pagenum = 0
1408         limit = n
1409
1410         while (50 * pagenum) < limit:
1411             self.report_download_page(query, pagenum+1)
1412             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1413             request = compat_urllib_request.Request(result_url)
1414             try:
1415                 data = compat_urllib_request.urlopen(request).read()
1416             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1417                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1418                 return
1419             api_response = json.loads(data)['data']
1420
1421             new_ids = list(video['id'] for video in api_response['items'])
1422             video_ids += new_ids
1423
1424             limit = min(n, api_response['totalItems'])
1425             pagenum += 1
1426
1427         if len(video_ids) > n:
1428             video_ids = video_ids[:n]
1429         for id in video_ids:
1430             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1431         return
1432
1433
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "gvsearch[N|all]:<terms>" query and download the results.

        An empty prefix requests one result, "all" requests up to
        _max_google_results, and a number requests that many (capped at
        _max_google_results).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons, and an unbounded split would raise ValueError
        # on unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, stopping as soon as n are collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1514
1515
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a "yvsearch[N|all]:<terms>" query and download the results.

        An empty prefix requests one result, "all" requests up to
        _max_yahoo_results, and a number requests that many (capped at
        _max_yahoo_results).
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain colons, and an unbounded split would raise ValueError
        # on unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, stopping as soon as n are collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1600
1601
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a playlist and queue it for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A URL that also names a single video: hand just that video off.
        single_video = mobj.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default prefix for playlists; artist lists ('a')
        # need a different access point.
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 0
        more_pages = True

        while more_pages:
            pagenum += 1
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids found on this page, dropping within-page duplicates.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            more_pages = self._MORE_PAGES_INDICATOR in page

        total = len(video_ids)

        # Honour the --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1680
1681
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a channel and queue it for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 0

        # Page through the channel's video list until the "next" marker
        # disappears from the page.
        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids found on this page, dropping within-page duplicates.
            ids_in_page = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1732
1733
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's uploads via the GData API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps result size per request (currently 50), so
        # request page after page; a short page signals the end of the
        # user's uploads.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect ids found on this page, dropping within-page duplicates.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the last
            # one, so there is no need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour the --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1815
1816
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect a blip.tv user's video ids and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # First fetch the user's page to find the numeric users_id that
        # the Ajax episode-list endpoint expects.
        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return


        # The Ajax endpoint returns at most _PAGE_SIZE (12) ids per call,
        # so request page after page; a short page signals the end.
        video_ids = []
        page_num = 0

        while True:
            page_num += 1
            self.report_download_page(username, page_num)

            request = compat_urllib_request.Request(page_base + "&page=" + str(page_num))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Collect ids found on this page (membership is tested on the
            # raw match; the stored value is HTML-unescaped).
            ids_in_page = []
            for match in re.finditer(r'href="/([^"]+)"', page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(match.group(1)))

            video_ids.extend(ids_in_page)

            # A page with fewer than _PAGE_SIZE ids must be the last one,
            # so there is no need to query again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

        all_ids_count = len(video_ids)

        # Honour the --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1907
1908
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so '\s' is a regex class, not an (invalid)
                # string escape sequence.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; under Python 3, str has no .decode — confirm the
        # supported interpreter versions before removing them.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1967
1968
1969 class FacebookIE(InfoExtractor):
1970     """Information Extractor for Facebook"""
1971
1972     _WORKING = False
1973     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1974     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1975     _NETRC_MACHINE = 'facebook'
1976     _available_formats = ['video', 'highqual', 'lowqual']
1977     _video_extensions = {
1978         'video': 'mp4',
1979         'highqual': 'mp4',
1980         'lowqual': 'mp4',
1981     }
1982     IE_NAME = u'facebook'
1983
    def __init__(self, downloader=None):
        # Delegate to the shared InfoExtractor constructor.
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        # All FacebookIE status output is funnelled through here so it
        # carries the '[facebook]' prefix consistently.
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)
2002
    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: maps each result key to the regex that captures
        # its value in the page source.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        # Keys whose regex does not match are simply absent from the result.
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): .decode("unicode_escape") assumes Python 2
                # byte strings; under Python 3, str has no .decode — confirm
                # before running this extractor on Python 3.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one candidate URL per known format, when present.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info
2028
    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from the --username/--password downloader params
        or, failing that, from the 'facebook' machine entry in the user's
        .netrc file. Without credentials this is a silent no-op; login
        failures only warn, they do not abort.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Being served the login form again means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2072
2073     def _real_extract(self, url):
2074         mobj = re.match(self._VALID_URL, url)
2075         if mobj is None:
2076             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2077             return
2078         video_id = mobj.group('ID')
2079
2080         # Get video webpage
2081         self.report_video_webpage_download(video_id)
2082         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2083         try:
2084             page = compat_urllib_request.urlopen(request)
2085             video_webpage = page.read()
2086         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2087             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2088             return
2089
2090         # Start extracting information
2091         self.report_information_extraction(video_id)
2092
2093         # Extract information
2094         video_info = self._parse_page(video_webpage)
2095
2096         # uploader
2097         if 'owner' not in video_info:
2098             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2099             return
2100         video_uploader = video_info['owner']
2101
2102         # title
2103         if 'title' not in video_info:
2104             self._downloader.trouble(u'ERROR: unable to extract video title')
2105             return
2106         video_title = video_info['title']
2107         video_title = video_title.decode('utf-8')
2108
2109         # thumbnail image
2110         if 'thumbnail' not in video_info:
2111             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2112             video_thumbnail = ''
2113         else:
2114             video_thumbnail = video_info['thumbnail']
2115
2116         # upload date
2117         upload_date = None
2118         if 'upload_date' in video_info:
2119             upload_time = video_info['upload_date']
2120             timetuple = email.utils.parsedate_tz(upload_time)
2121             if timetuple is not None:
2122                 try:
2123                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2124                 except:
2125                     pass
2126
2127         # description
2128         video_description = video_info.get('description', 'No description available.')
2129
2130         url_map = video_info['video_urls']
2131         if url_map:
2132             # Decide which formats to download
2133             req_format = self._downloader.params.get('format', None)
2134             format_limit = self._downloader.params.get('format_limit', None)
2135
2136             if format_limit is not None and format_limit in self._available_formats:
2137                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2138             else:
2139                 format_list = self._available_formats
2140             existing_formats = [x for x in format_list if x in url_map]
2141             if len(existing_formats) == 0:
2142                 self._downloader.trouble(u'ERROR: no known formats available for video')
2143                 return
2144             if req_format is None:
2145                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2146             elif req_format == 'worst':
2147                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2148             elif req_format == '-1':
2149                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2150             else:
2151                 # Specific format
2152                 if req_format not in url_map:
2153                     self._downloader.trouble(u'ERROR: requested format not available')
2154                     return
2155                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2156
2157         results = []
2158         for format_param, video_real_url in video_url_list:
2159             # Extension
2160             video_extension = self._video_extensions.get(format_param, 'mp4')
2161
2162             results.append({
2163                 'id':       video_id.decode('utf-8'),
2164                 'url':      video_real_url.decode('utf-8'),
2165                 'uploader': video_uploader.decode('utf-8'),
2166                 'upload_date':  upload_date,
2167                 'title':    video_title,
2168                 'ext':      video_extension.decode('utf-8'),
2169                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2170                 'thumbnail':    video_thumbnail.decode('utf-8'),
2171                 'description':  video_description.decode('utf-8'),
2172             })
2173         return results
2174
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches any blip.tv path; metadata comes from the site's JSON API.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the file extension out of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information via blip.tv's JSON API.

        If the URL itself serves a video Content-Type, a minimal info dict
        is built from the URL instead of parsing JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL is the media file itself; derive id/title from it.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle for the download itself.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is still live here.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The API sometimes wraps the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp is e.g. '12-31-12 11:59PM' -> normalize to YYYYMMDD
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): mutates the global default headers for all later
        # requests, not just this extractor's download — confirm intended.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2264
2265
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # The explicit __init__ that only delegated to InfoExtractor.__init__
    # was superfluous and has been removed; the inherited one is identical.

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Scrape the myvideo.de watch page and return a one-entry result list.

        The flv URL is reconstructed from the thumbnail CDN path plus the
        video id; returns None after reporting any error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: was self._download.trouble, which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The image_src link exposes the CDN movie path the flv lives under.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2323
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like: http://www.thedailyshow.com/full-episodes/...
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Bitrates, best first; used to pick and label formats below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Resolution per bitrate, for --list-formats display only.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden so the verbose _VALID_URL above is matched with
        # re.VERBOSE (presumably the base implementation does not pass it).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        # Print "bitrate : extension [dimensions]" for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per RSS item.

        Flow: expand shortname URLs, follow the newest-episode redirect if
        needed, locate the mtvnservices player URL in the page, download the
        MRSS index, then fetch per-item configuration to pick a bitrate.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # ":tds"-style abbreviations expand to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirects the index page to the newest episode; the
            # final URL must now carry a concrete episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Follow redirects to get the canonical player URL.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like "...:<show>.com:<mediaId>"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Each <rendition> carries a bitrate and its stream URL.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # NOTE(review): 'format' shadows the builtin of the same name.
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                # NOTE(review): this is the raw RSS pubDate string, not the
                # YYYYMMDD form the class docs call for — confirm downstream.
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2534
2535
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the flv URL from the page's player configuration.

        Reads the og:* meta tags for description/thumbnail/player, follows
        the player's config= parameter to a JSON-ish config, and returns a
        one-entry result list (or None after reporting an error).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset from the Content-Type header, default utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Fix: each of these matches used to be dereferenced unchecked, so a
        # missing meta tag crashed with AttributeError instead of reporting.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2609
2610
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: this extractor is currently non-functional.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build an info dict from the moogaloop XML plus an f4m manifest.

        Two requests: the metadata XML gives title/description/thumbnail and
        a manifest URL; the manifest gives the media/segment identifiers the
        final f4f URL is assembled from.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the two XML documents below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the XML isn't the expected shape.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe manifest namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Assemble the first-segment/first-fragment URL from the manifest ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2681
2682
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report that the watch page is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Report that metadata extraction has started."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Scrape the watch page and return a single-entry result list."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Everything is scraped out of the watch page's HTML.
        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return
        webpage = raw_page.decode('utf-8', 'replace')

        self.report_extraction(video_id)

        # Media URL (percent-encoded inside a query fragment)
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title, taken from the <title> element
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail image URL
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2752
2753
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report resolving the track through the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream URL."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track via api.soundcloud.com and return its info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Uploader name and the slug of the song title are both in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical page URL into the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the list of stream URLs for the resolved track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # upload_date must be YYYYMMDD per the extractor contract, but the
        # API reports a full timestamp (e.g. "2012/05/30 12:00:00 +0000");
        # convert when the leading date is recognizable, else leave None.
        upload_date = None
        mobj = re.search(r'(\d{4})[/-](\d{2})[/-](\d{2})', compat_str(info.get('created_at') or u''))
        if mobj is not None:
            upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2826
2827
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe URL, title and description from an InfoQ page."""
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text here: matching str patterns against the raw
            # bytes (and calling str.decode on the match groups, as the old
            # code did) breaks on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded stream path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        # base64.b64decode replaces the Python-2-only str.decode('base64').
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1)).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder).
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The video id and extension are taken from the stream file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2896
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format entry maps bitrates to url lists, pick the requested
        bitrate (or the highest for 'best'/unknown); if it is a plain url
        list (no bitrate info), return it as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API and return the info dict for the cloudcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url (match groups are already text;
        # the former .decode('utf-8') calls crashed on Python 3)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        json_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(json_url)
        try:
            self.report_download_json(json_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (the HTTP response is bytes; decode before json.loads)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # no candidate url responded (or there were no formats at all)
            self._downloader.trouble(u'ERROR: unable to find a working stream url')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3011
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # One pattern covers three page kinds: a specific video (course+video),
    # a course page (course only), and the site root (neither group set).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL kind: single video, course playlist, or site root.

        Course and root pages recurse through self.extract() on each
        discovered link, so the result is a flat list of video info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sidecar XML file with its title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when the page has no <h1>.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a reference
            # entry that is re-fed through self.extract() below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each CoursePage link recurses into the course branch above,
            # which in turn recurses into the individual videos.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3128
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract song metadata from the page and the stream URL from mediaGen."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text here so the str regexes below work on Python 3;
            # the old code instead ran .decode('iso-8859-1') on each match
            # group, which fails on py3 str objects.
            webpage = compat_urllib_request.urlopen(request).read().decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # (message previously read "unable to mtvn_uri")
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3218
3219
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com video pages."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random ints."""
        millis = int(time.time() * 1000)
        part_a = random.randint(1000, 1998)
        part_b = random.randint(1000, 9999)

        return "%d%d%d" % (millis, part_a, part_b)

    def _get_file_ID_mix_string(self, seed):
        """Derive the seed-dependent character permutation used to decode file ids."""
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        shuffled = []
        state = float(seed)
        # Linear-congruential walk: each step picks (and removes) one
        # character from the remaining alphabet.
        while alphabet:
            state = (state * 211 + 30031) % 65536
            pos = int(math.floor(state / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id via the permutation."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Fetch the playlist JSON and build one info dict per flv segment."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            entry = config['data'][0]
            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested format onto Youku's stream names.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id carry the segment number,
        # so each segment rewrites that slice with its own index.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3329
3330
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the video webpage is being fetched."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that metadata is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the page source."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3393
3394
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out: without a match the next line would crash on None.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out: an empty list would crash on links[-1] below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3518
class NBAIE(InfoExtractor):
    """Information extractor for videos hosted on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Strip a trailing /index.html so the id maps onto the CDN path below.
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The actual media file lives on Turner's CDN at a predictable path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Search the downloaded page; return group 1 (HTML-unescaped) or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was misspelled 'uploader_date'; the downloader
            # contract (see module docstring) expects 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3564
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page; return (item count, list of info dicts)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # Fixed: previously fell through and returned None, which made the
            # caller's tuple unpacking raise TypeError. An empty page lets the
            # paging loop in _real_extract terminate cleanly.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a file URL are unavailable clips; skip them.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # created_on is ISO-ish 'YYYY-MM-DD...'; keep YYYYMMDD only.
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only => a channel URL: page through the archive listing.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3643
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Fixed: bail out instead of dereferencing m (None) below.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Fixed: bail out instead of dereferencing m (None) below.
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when the meta tag is absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3690
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Fixed: bail out instead of dereferencing m (None) below.
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Description is non-fatal; warn and carry on without it.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded links before unescaping the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            # Fixed: bail out instead of dereferencing m (None) below.
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            # Fixed: bail out instead of dereferencing m (None) below.
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3749         
class SteamIE(InfoExtractor):
    """Information extractor for trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL lacks an inline (?x), so the default suitable() cannot
        # be used; match with re.VERBOSE explicitly.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_video_page(self, game_id):
        """Report attempt to download the game's video page."""
        self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        try:
            self.report_download_video_page(gameID)
            urlh = compat_urllib_request.urlopen(videourl)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        # Titles appear in the page in the same order as the movie entries.
        titles = list(re.finditer(namesRE, webpage))
        videos = []
        i = 0
        for vid in mweb:
            video_id = vid.group('videoID')
            title = titles[i].group('videoName')
            video_url = vid.group('videoURL')
            i += 1
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Fixed: previously the entry was appended anyway with an
                # empty URL; skip unusable entries instead.
                continue
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': title
            })
        return videos
3797         
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The recording id comes straight from the URL; the media URL is
        # derived from it without needing anything from the page.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            webpage = raw_page.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        # Title and uploader (channel id) are scraped from the page markup.
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3825
3826
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Most specific extractors come first; GenericIE is the catch-all.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3870
3871