Removed a spurious increment_downloads, this time cleanly
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import datetime
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import email.utils
13 import xml.etree.ElementTree
14 import random
15 import math
16
17 from .utils import *
18
19
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor receives a URL and gathers everything needed
    to download the video (or videos) it refers to: the real media URL,
    the title, the uploader, and so on. The gathered information is a
    dictionary handed over to the FileDownloader, which may download the
    video, print metadata, or perform any other supported action.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should contain Unicode strings.

    Subclasses should define a _VALID_URL regexp and redefine the
    _real_initialize() and _real_extract() methods; normally they should
    also be added to the list of extractors. _real_extract() must return
    a *list* of information dictionaries as described above.

    Broken extractors should set the _WORKING attribute to False so the
    users are warned and the tests are skipped.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance (may be None)
    _WORKING = True         # set to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Perform one-time initialization (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the public name from the class name, minus the "IE" suffix.
        return self.__class__.__name__[:-2]
107
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (used for --list-formats and the format field)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base implementation
        # (which compiles without flags) cannot be reused here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning, srt) tuple: on success warning is None and srt
        holds the .srt contents; on failure srt is None and warning holds
        the message to show.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the requested language, falling back to English, then to
        # whatever track comes first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and optionally log in / confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form is still present -> credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the 11-character video ID from url (group 2 of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, since some of them may
        # fail for a given video (e.g. embedding-disabled or vevo videos)
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: try each known date layout until one parses
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # strptime raises ValueError on a non-matching layout;
                    # a bare except here would also swallow KeyboardInterrupt
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per selected format
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
525
526
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the
        # YouTube extractor via the downloader
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information.
        # Decode it once here so all subsequent regex matching operates on
        # str, not bytes (matching str patterns against bytes raises
        # TypeError on Python 3; see DailymotionIE for the same approach).
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage was already decoded above, so the matched groups are str;
        # the previous .decode('utf-8') calls would fail on Python 3
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
652
653
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, media URL, title, uploader and upload date from a Dailymotion page."""
        # The id is the first URL path component, stripped of any
        # '_'-separated slug or query-string suffix.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the video page; the cookie disables the family filter so
        # age-restricted pages still expose their flashvars.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Locate the URL-encoded flashvars blob that carries the media URLs.
        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Choose the best quality key present, highest resolution first.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The JSON-style value escapes '/' as '\/'; undo that after unquoting.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # official-user markup; warn (but continue) if neither matches.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; store it as YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
750
751
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a photobucket page."""
        # Extract id from URL (the 'current=...' query parameter)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # urlopen().read() returns bytes on Python 3; decode so the
            # str regexes below can be applied (matches the convention
            # used by the other extractors in this file).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Regex groups are already text here; the previous .decode('utf-8')
        # calls raised AttributeError on Python 3 (str has no decode method).
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
815
816
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract id, title, uploader, thumbnail, description and media URL.

        URLs that are not canonical '/watch/' pages are rewritten to that
        form and re-dispatched once with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # read() returns bytes on Python 3; decode so the str
                # regexes below can be matched against the page.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # group(1) is the '(people|profile)' alternation; the uploader name
        # itself is captured by the second group (the previous code used
        # group(1) and so always reported 'people' or 'profile').
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        # The groups are already text; the previous .decode('utf-8') call
        # raised AttributeError on Python 3.
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
958
959
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and build a play_redirect media URL from a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded inline as '... = {config:...,assets:...}'
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is missing from the page;
            # ValueError: the extracted blob is not valid JSON. The previous
            # bare 'except:' also swallowed KeyboardInterrupt/SystemExit.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (ISO date in the dateCreated meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available (codec, extension, quality) triple,
        # preferring hd, then sd, then whatever remains.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1074
1075
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the response body, or None after
        reporting trouble on network or URL errors."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return a dict mapping each key in
        matchTuples to its matched group; reports the tuple's error message
        and returns None if a group did not match.

        matchTuples is a list of (group_index, key, error_message).
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed it returned None, and
        # re.search(regex, None, ...) raises TypeError here — confirm this
        # path is acceptable upstream.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to the rtmp stream info for a live page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is built here but never returned — the
        # method ends without yielding a result, so live-stream extraction
        # produces nothing for the caller.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through its videoref XML chain and return
        an info dict for the hd-quality stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): on Python 3 str has no .decode — presumably this
            # path only ran under Python 2; confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or arte+7 extractor based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # NOTE(review): the live branch returns None (extractLiveStream
        # discards its result), so only arte+7 pages yield an info list.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1210
1211
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # HEAD-only request: resolve redirects without downloading bodies.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): HTTPErrorProcessor is referenced through the error
        # module alias here — verify the compat layer exposes it there.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same final URL: not a redirect, let the normal extraction proceed.
        if url == new_url:
            return False

        # Restart the extraction chain on the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Last-resort extraction: find a direct media URL in the raw page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # NOTE(review): webpage is bytes here (read() is not decoded);
            # the str regexes below presumably ran under Python 2 — confirm
            # behavior on Python 3.
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1356
1357
1358 class YoutubeSearchIE(InfoExtractor):
1359     """Information Extractor for YouTube search queries."""
1360     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1361     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1362     _max_youtube_results = 1000
1363     IE_NAME = u'youtube:search'
1364
    def __init__(self, downloader=None):
        """Delegate to InfoExtractor, storing the optional downloader."""
        InfoExtractor.__init__(self, downloader)
1367
1368     def report_download_page(self, query, pagenum):
1369         """Report attempt to download search page with given number."""
1370         query = query.decode(preferredencoding())
1371         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1372
1373     def _real_extract(self, query):
1374         mobj = re.match(self._VALID_URL, query)
1375         if mobj is None:
1376             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1377             return
1378
1379         prefix, query = query.split(':')
1380         prefix = prefix[8:]
1381         query = query.encode('utf-8')
1382         if prefix == '':
1383             self._download_n_results(query, 1)
1384             return
1385         elif prefix == 'all':
1386             self._download_n_results(query, self._max_youtube_results)
1387             return
1388         else:
1389             try:
1390                 n = int(prefix)
1391                 if n <= 0:
1392                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1393                     return
1394                 elif n > self._max_youtube_results:
1395                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1396                     n = self._max_youtube_results
1397                 self._download_n_results(query, n)
1398                 return
1399             except ValueError: # parsing prefix as integer fails
1400                 self._download_n_results(query, 1)
1401                 return
1402
1403     def _download_n_results(self, query, n):
1404         """Downloads a specified number of results for a query"""
1405
1406         video_ids = []
1407         pagenum = 0
1408         limit = n
1409
1410         while (50 * pagenum) < limit:
1411             self.report_download_page(query, pagenum+1)
1412             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1413             request = compat_urllib_request.Request(result_url)
1414             try:
1415                 data = compat_urllib_request.urlopen(request).read()
1416             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1417                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1418                 return
1419             api_response = json.loads(data)['data']
1420
1421             new_ids = list(video['id'] for video in api_response['items'])
1422             video_ids += new_ids
1423
1424             limit = min(n, api_response['totalItems'])
1425             pagenum += 1
1426
1427         if len(video_ids) > n:
1428             video_ids = video_ids[:n]
1429         for id in video_ids:
1430             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1431         return
1432
1433
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch[N|all]:<query>' pseudo-URLs by scraping Google Video
    result pages and queueing each hit for download.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only -- the query text itself may contain
        # colons; an unbounded split would raise ValueError on unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try body minimal so a ValueError raised while
            # downloading is not mistaken for a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means the results are exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1514
1515
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch[N|all]:<query>' pseudo-URLs by scraping Yahoo! Video
    result pages and queueing each hit for download.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first ':' only -- the query text itself may contain
        # colons; an unbounded split would raise ValueError on unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try body minimal so a ValueError raised while
            # downloading is not mistaken for a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link means the results are exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1600
1601
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: playlist type marker ('p', 'a' or 'list'); group 2: playlist id;
    # group 3 (optional): a single video id embedded after the playlist id.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled with (access page, prefix, playlist id, page number).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Presence of this text on a page means there is a further page of results.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue them for download.

        Honours the downloader's 'playliststart'/'playlistend' parameters.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            # 'list' and plain ids are fetched through the regular
            # view_play_list page with the 'p' parameter.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within each page only)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user-requested slice of the playlist.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1680
1681
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Presence of this text on a page means there is a further page of results.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a channel and queue it for download."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)

        # Walk the paginated video list until the "next page" marker vanishes.
        video_ids = []
        page_number = 1
        while True:
            self.report_download_page(channel_id, page_number)
            page_url = self._TEMPLATE_URL % (channel_id, page_number)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            page_ids = []
            for id_match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_number += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1732
1733
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue every upload of a user, honouring playliststart/playlistend."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so we
        # page through the feed; a short page means we have reached the end.
        video_ids = []
        page_index = 0
        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, first occurrence only.
            page_ids = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = id_match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page shorter than the page size must be the last one;
            # stop here instead of issuing one more (empty) request.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)

        # Apply the user-requested slice of the upload list.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1816
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their profile page, then pages
    through the mobile Ajax episode list to queue every video.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every video of a blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id lives in a data attribute of the profile page;
        # report cleanly if it is missing instead of crashing on .group(1).
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape before the duplicate check
            # so escaped and raw forms of the same id cannot both be stored.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the user-requested slice of the video list.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1907
1908
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Requests the file page with the 'Free download' form pre-submitted and
    scrapes the real download URL and title out of the response.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so \s is a regex class, not an invalid str escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1971
1972
1973 class FacebookIE(InfoExtractor):
1974     """Information Extractor for Facebook"""
1975
1976     _WORKING = False
1977     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1978     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1979     _NETRC_MACHINE = 'facebook'
1980     _available_formats = ['video', 'highqual', 'lowqual']
1981     _video_extensions = {
1982         'video': 'mp4',
1983         'highqual': 'mp4',
1984         'lowqual': 'mp4',
1985     }
1986     IE_NAME = u'facebook'
1987
    def __init__(self, downloader=None):
        """Initialize the extractor with an optional FileDownloader."""
        InfoExtractor.__init__(self, downloader)
1990
1991     def _reporter(self, message):
1992         """Add header and report message."""
1993         self._downloader.to_screen(u'[facebook] %s' % message)
1994
    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')
1998
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)
2002
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)
2006
2007     def _parse_page(self, video_webpage):
2008         """Extract video information from page"""
2009         # General data
2010         data = {'title': r'\("video_title", "(.*?)"\)',
2011             'description': r'<div class="datawrap">(.*?)</div>',
2012             'owner': r'\("video_owner_name", "(.*?)"\)',
2013             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2014             }
2015         video_info = {}
2016         for piece in data.keys():
2017             mobj = re.search(data[piece], video_webpage)
2018             if mobj is not None:
2019                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2020
2021         # Video urls
2022         video_urls = {}
2023         for fmt in self._available_formats:
2024             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2025             if mobj is not None:
2026                 # URL is in a Javascript segment inside an escaped Unicode format within
2027                 # the generally utf-8 page
2028                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2029         video_info['video_urls'] = video_urls
2030
2031         return video_info
2032
    def _real_initialize(self):
        """Log in to Facebook before extraction.

        Credentials come from --username/--password or, failing that, from
        the user's .netrc entry for this machine. Without credentials the
        method is a silent no-op; login failures only emit warnings so
        extraction of public videos can still proceed.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: skip login entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being echoed back means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2076
2077     def _real_extract(self, url):
2078         mobj = re.match(self._VALID_URL, url)
2079         if mobj is None:
2080             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2081             return
2082         video_id = mobj.group('ID')
2083
2084         # Get video webpage
2085         self.report_video_webpage_download(video_id)
2086         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2087         try:
2088             page = compat_urllib_request.urlopen(request)
2089             video_webpage = page.read()
2090         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2091             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2092             return
2093
2094         # Start extracting information
2095         self.report_information_extraction(video_id)
2096
2097         # Extract information
2098         video_info = self._parse_page(video_webpage)
2099
2100         # uploader
2101         if 'owner' not in video_info:
2102             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2103             return
2104         video_uploader = video_info['owner']
2105
2106         # title
2107         if 'title' not in video_info:
2108             self._downloader.trouble(u'ERROR: unable to extract video title')
2109             return
2110         video_title = video_info['title']
2111         video_title = video_title.decode('utf-8')
2112
2113         # thumbnail image
2114         if 'thumbnail' not in video_info:
2115             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2116             video_thumbnail = ''
2117         else:
2118             video_thumbnail = video_info['thumbnail']
2119
2120         # upload date
2121         upload_date = None
2122         if 'upload_date' in video_info:
2123             upload_time = video_info['upload_date']
2124             timetuple = email.utils.parsedate_tz(upload_time)
2125             if timetuple is not None:
2126                 try:
2127                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2128                 except:
2129                     pass
2130
2131         # description
2132         video_description = video_info.get('description', 'No description available.')
2133
2134         url_map = video_info['video_urls']
2135         if url_map:
2136             # Decide which formats to download
2137             req_format = self._downloader.params.get('format', None)
2138             format_limit = self._downloader.params.get('format_limit', None)
2139
2140             if format_limit is not None and format_limit in self._available_formats:
2141                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2142             else:
2143                 format_list = self._available_formats
2144             existing_formats = [x for x in format_list if x in url_map]
2145             if len(existing_formats) == 0:
2146                 self._downloader.trouble(u'ERROR: no known formats available for video')
2147                 return
2148             if req_format is None:
2149                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2150             elif req_format == 'worst':
2151                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2152             elif req_format == '-1':
2153                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2154             else:
2155                 # Specific format
2156                 if req_format not in url_map:
2157                     self._downloader.trouble(u'ERROR: requested format not available')
2158                     return
2159                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2160
2161         results = []
2162         for format_param, video_real_url in video_url_list:
2163             # Extension
2164             video_extension = self._video_extensions.get(format_param, 'mp4')
2165
2166             results.append({
2167                 'id':       video_id.decode('utf-8'),
2168                 'url':      video_real_url.decode('utf-8'),
2169                 'uploader': video_uploader.decode('utf-8'),
2170                 'upload_date':  upload_date,
2171                 'title':    video_title,
2172                 'ext':      video_extension.decode('utf-8'),
2173                 'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
2174                 'thumbnail':    video_thumbnail.decode('utf-8'),
2175                 'description':  video_description.decode('utf-8'),
2176             })
2177         return results
2178
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract blip.tv video information.

        Requests the JSON metadata variant of the page URL. If the
        server answers with a video/* Content-Type, the URL is a direct
        media link and a minimal info dict is built around it instead
        of parsing JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON API parameters with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # The open handle is passed along so the downloader can
                # reuse it instead of re-requesting the file.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                # urlh is guaranteed to exist here: a failed urlopen
                # already returned from the except clause above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' follows the '%m-%d-%y %H:%M%p' layout.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): mutates the global default headers for all
        # subsequent requests — presumably required by blip.tv's media
        # servers; confirm before changing.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2268
2269
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        # Plain delegation; kept for interface compatibility.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a myvideo.de watch URL.

        Returns a one-element list containing the info dictionary, or
        None on error (errors are reported through the downloader).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: this previously called self._download.trouble — a
            # nonexistent attribute — so invalid URLs raised
            # AttributeError instead of being reported.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link embeds the media base URL; the .flv lives
        # under the same path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2327
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for full episodes use the /full-episodes/ path
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Known bitrates, ordered best-first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolutions per bitrate, used by _print_formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # suitable() from the base class cannot be used.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show index (MRSS feed)."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)


    def _print_formats(self, formats):
        """Print the available bitrates with extension and resolution."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one or more video parts for an episode or clip.

        Shortname URLs (e.g. :tds) redirect to the newest full episode.
        Full episodes consist of several acts; one info dict is
        returned per act.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Map the :shortname abbreviations onto the corresponding
        # full-episodes landing page and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode group means "download the newest one";
            # the site redirects to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the concrete episode URL and
            # re-derive the episode title from it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id without the
            # mtvnservices URL prefix; extract the alternate reference
            # and add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            # Resolve redirects to obtain the final player URL.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        # Fetch the MRSS index listing every act of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like '...:showname.com:shortMediaId'
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-act configuration XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
2538
2539
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce that the player configuration is being fetched."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Build the info dictionary for an Escapist episode URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            response = compat_urllib_request.urlopen(url)
            rawPage = response.read()
            # Honour the charset declared in Content-Type; fall back to UTF-8.
            charsetMatch = re.match(r'text/html; charset="?([^"]+)"?', response.headers['Content-Type'])
            pageCharset = charsetMatch.group(1) if charsetMatch else 'utf-8'
            webPage = rawPage.decode(pageCharset)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in <meta> / OpenGraph tags on the page.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', webPage).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', webPage).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', webPage).group(1))
        # The player URL carries the (percent-encoded) config URL.
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            configHandle = compat_urllib_request.urlopen(configUrl)
            charsetMatch = re.match(r'text/html; charset="?([^"]+)"?', configHandle.headers['Content-Type'])
            configCharset = charsetMatch.group(1) if charsetMatch else 'utf-8'
            configJSON = configHandle.read().decode(configCharset)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2613
2614
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): presumably flags this extractor as broken/disabled
    # for the framework — confirm how _WORKING is consumed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract an Adobe HDS (f4f) download for a CollegeHumor video.

        Two round trips: first the moogaloop metadata XML (title,
        description, thumbnail, manifest URL), then the f4m manifest
        from which the real media URL is assembled.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # NOTE: video_id is deliberately re-bound here to the
            # manifest-level <id>, which feeds the URL construction below.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Assemble the HDS fragment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2685
2686
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the info dictionary for an xvideos.com video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flv_url parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title is taken from the page's <title> element.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the entire matched URL is used (group 0).
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2756
2757
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    A track page URL is resolved through SoundCloud's public
    resolve.json API endpoint to obtain the numeric track id, and the
    streams endpoint is then queried for the 128 kbit/s MP3 stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track URL is being resolved to an id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream URL is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader name and the track slug are encoded in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track page to its JSON metadata record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Query the streams endpoint for the actual media URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2830
2831
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com talks.

    The page embeds a base64-encoded media path; the real video URL is
    the rtmpe base plus the decoded path.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Local import keeps the fix self-contained; base64.b64decode
        # replaces the Python 2-only str.decode('base64') idiom.
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            # Decode to text up front: on Python 3, read() returns bytes,
            # which the str regexps below would fail to search.
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL (base64-encoded media path in the page source)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive the id and extension from the last path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2900
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a bitrate -> url-list mapping or a plain
        url list; 'best' (or an unknown bitrate) selects the highest
        available bitrate when bitrate info exists.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a GET; the first one that opens without
        # error wins. None when every candidate fails.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # List every format/bitrate pair to stdout (used for --list-formats).
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a match-group result is a
        # Python 2 idiom; under Python 3 these are str and have no .decode
        # (extractor is currently disabled via _WORKING) — confirm before
        # re-enabling.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every advertised format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; they would raise AttributeError on Python 3 str —
        # verify when this extractor is revived.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
3015
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course + video), a course
    page (playlist of its videos), and the root page (playlist of courses).
    Playlist pages are expanded by recursively feeding each linked page
    back through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # ElementTree parses bytes directly, so no decoding needed.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage_bytes = compat_urllib_request.urlopen(url).read()
                # Decode before matching: on Python 3 read() returns bytes,
                # which the str regexps below cannot search.
                coursepage = coursepage_bytes.decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage_bytes = compat_urllib_request.urlopen(rootURL).read()
                # Decode before matching (see note on the course page above).
                rootpage = rootpage_bytes.decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3132
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads the page's mtv_vt/mtv_an/mtvn_uri meta tags and the default
    playlist id, then asks the mediaGen service for the rendition list and
    picks the highest quality one.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # Decode the page once as ISO-8859-1 (the charset the individual
            # fields were previously decoded with). Required on Python 3,
            # where read() returns bytes that str regexps cannot search.
            webpage = webpage_bytes.decode('iso-8859-1')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message previously read 'unable to mtvn_uri' (missing verb).
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3222
3223
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into segments; each segment URL is built from a
    session id, the segment index and a file id that must first be
    de-obfuscated with a seed-driven alphabet shuffle.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock plus two random ints."""
        now_ms = int(time.time() * 1000)
        return "%d%d%d" % (now_ms, random.randint(1000, 1998), random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Return the base alphabet shuffled deterministically by *seed*.

        A linear congruential generator driven by the seed picks characters
        out of the shrinking source alphabet one at a time.
        """
        shuffled = []
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        state = float(seed)
        for _ in range(len(pool)):
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string via the shuffled alphabet."""
        shuffled = self._get_file_ID_mix_string(seed)
        return ''.join(shuffled[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the requested format onto Youku's stream names; flv is the
            # catch-all default.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the file id encode the segment number, so they
        # are replaced per segment below.
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3338
3339
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it to text for the regexp scans below.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Flash video URL (percent-encoded in the page source)
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title from the <title> tag, minus the site suffix
        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Thumbnail URL
        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3402
3403
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com posts with videos.

    Downloads the post page for metadata, then the photo/video page it
    links to, and picks the highest-resolution stream listed there.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Fixed: without this return, mobj.group(1) below raised
            # AttributeError instead of reporting the error cleanly.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Fixed: without this return, links[-1] below raised IndexError
            # instead of reporting the error cleanly.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3527
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    Derives the direct MP4 URL from the path component of the page URL and
    scrapes title, date and description out of the page HTML.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is derived directly from the page path, no scraping needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first group of rexp in the page (HTML-unescaped), or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the downloader's
            # documented field is 'upload_date', so the date was being dropped.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3573
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one JSON API page; return (item_count, info_dicts).

        On download failure, reports the error and returns (0, []) so the
        caller's tuple unpacking and pagination loop terminate cleanly.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            # BUG FIX: previously returned None, which made the caller's
            # `page_count, page_info = self._parse_page(...)` raise TypeError.
            return (0, [])

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped (count still includes them
            # so pagination keys off the raw response length).
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'created_on' starts with YYYY-MM-DD; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 only -> channel archive (paged); group 2 -> single clip.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3652
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # BUG FIX: without this return, m.group() below raised
            # AttributeError when the downloader continues past errors.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # BUG FIX: same missing-return crash as above.
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional; absence is not an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3700
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
    # BUG FIX: IE_NAME was missing; report_extraction references self.IE_NAME
    # (every other extractor in this file defines it).
    IE_NAME = u'TweetReel'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # BUG FIX: without this return, m.group(1) raised AttributeError
            # when the downloader continues past errors.
            return
        status_id = m.group(1)

        # Description is non-fatal (message is a WARNING); previously the
        # code crashed on m.group(1) when the div was missing.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if m:
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
        else:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            # BUG FIX: same missing-return crash as above.
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            # BUG FIX: same missing-return crash as above.
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # Fall back to the video id when no description/title could be found.
        title = desc if desc is not None else video_id
        # The .mov file URL is derived from the status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]