2 # -*- coding: utf-8 -*-
11 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazily initialized by initialize(); _WORKING=False marks a broken IE.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): table entries other than '38' were elided in the reviewed
    # copy and are reconstructed from the itag convention — confirm upstream.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # NOTE(review): dimensions table reconstructed — confirm upstream.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses extended syntax, so re.VERBOSE is required here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt file contents."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur:
                dur = '4'  # default caption duration when dur= is absent
            start = float(start)  # regex groups are strings; coerce before math
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print the known itag/extension/dimension for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language so the scraped pages come back in English.
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':       '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video metadata and direct URLs; returns a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info; try several &el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage = compat_urllib_request.urlopen(request).read()
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass  # try the next date format

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    # FIX: dict.keys() is not subscriptable on Python 3.
                    srt_lang = list(srt_lang_list.keys())[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = compat_urllib_request.urlopen(request).read()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # FIX: exceptions are not indexable on Python 3; use .args.
                self._downloader.trouble(trouble.args[0])

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
                                            self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       video_format,
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt- prefixed ids straight to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse flashvars for the media URL.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality flashvars URL, title, uploader and date."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip title suffix and query string from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the highest quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and title from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct mp4 download; fall back to the flv stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     None,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
885 class YahooIE(InfoExtractor):
886 """Information extractor for video.yahoo.com."""
888 # _VALID_URL matches all Yahoo! Video URLs
889 # _VPAGE_URL matches only the extractable '/watch/' URLs
890 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
891 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
892 IE_NAME = u'video.yahoo'
894 def __init__(self, downloader=None):
895 InfoExtractor.__init__(self, downloader)
897 def report_download_webpage(self, video_id):
898 """Report webpage download."""
899 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
901 def report_extraction(self, video_id):
902 """Report information extraction."""
903 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# Extract the real video URL and metadata for a video.yahoo.com page.
# Recurses once (new_video=False) after rewriting a non-canonical URL to the
# English /watch/ form.
# NOTE(review): gaps in the embedded numbering indicate that guard lines
# (e.g. `if mobj is None:`, `try:`, `return` statements and the closing of the
# final returned dict/list) were elided from this listing — confirm against VCS.
905 def _real_extract(self, url, new_video=True):
906 # Extract ID from URL
907 mobj = re.match(self._VALID_URL, url)
909 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
912 video_id = mobj.group(2)
913 video_extension = 'flv'
915 # Rewrite valid but non-extractable URLs as
916 # extractable English language /watch/ URLs
917 if re.match(self._VPAGE_URL, url) is None:
918 request = compat_urllib_request.Request(url)
920 webpage = compat_urllib_request.urlopen(request).read()
921 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
# Pull the canonical id/vid pair out of the page's JavaScript and rebuild the URL.
925 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
927 self._downloader.trouble(u'ERROR: Unable to extract id field')
929 yahoo_id = mobj.group(1)
931 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
933 self._downloader.trouble(u'ERROR: Unable to extract vid field')
935 yahoo_vid = mobj.group(1)
937 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
# Recurse exactly once on the rewritten canonical URL.
938 return self._real_extract(url, new_video=False)
940 # Retrieve video webpage to extract further information
941 request = compat_urllib_request.Request(url)
943 self.report_download_webpage(video_id)
944 webpage = compat_urllib_request.urlopen(request).read()
945 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
946 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
949 # Extract uploader and title from webpage
950 self.report_extraction(video_id)
951 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
953 self._downloader.trouble(u'ERROR: unable to extract video title')
955 video_title = mobj.group(1).decode('utf-8')
957 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
959 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): the regex above has two groups — group(1) is the
# 'people'/'profile' path token, group(2) is the uploader name. Using
# group(1) here looks wrong (would yield 'people' or 'profile'); confirm.
961 video_uploader = mobj.group(1).decode('utf-8')
963 # Extract video thumbnail
964 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
966 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
968 video_thumbnail = mobj.group(1).decode('utf-8')
970 # Extract video description
971 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
973 self._downloader.trouble(u'ERROR: unable to extract video description')
975 video_description = mobj.group(1).decode('utf-8')
976 if not video_description:
977 video_description = 'No description available.'
979 # Extract video height and width
980 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
982 self._downloader.trouble(u'ERROR: unable to extract video height')
984 yv_video_height = mobj.group(1)
986 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
988 self._downloader.trouble(u'ERROR: unable to extract video width')
990 yv_video_width = mobj.group(1)
992 # Retrieve video playlist to extract media URL
993 # I'm not completely sure what all these options are, but we
994 # seem to need most of them, otherwise the server sends a 401.
995 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
996 yv_bitrate = '700' # according to Wikipedia this is hard-coded
997 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
998 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
999 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1001 self.report_download_webpage(video_id)
1002 webpage = compat_urllib_request.urlopen(request).read()
1003 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1007 # Extract media URL from playlist XML
1008 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1010 self._downloader.trouble(u'ERROR: Unable to extract media URL')
# APP + FULLPATH are concatenated, percent-decoded, then HTML-unescaped.
1012 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013 video_url = unescapeHTML(video_url)
# Result dictionary handed back to the FileDownloader (see the class docstring
# for the required fields). upload_date is unavailable on Yahoo pages.
1016 'id': video_id.decode('utf-8'),
1018 'uploader': video_uploader,
1019 'upload_date': None,
1020 'title': video_title,
1021 'ext': video_extension.decode('utf-8'),
1022 'thumbnail': video_thumbnail.decode('utf-8'),
1023 'description': video_description,
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, dict braces) were elided from this listing — confirm against VCS.
1027 class VimeoIE(InfoExtractor):
1028 """Information extractor for vimeo.com."""
1030 # _VALID_URL matches Vimeo URLs
1031 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
# Constructor: delegates to the InfoExtractor base.
1034 def __init__(self, downloader=None):
1035 InfoExtractor.__init__(self, downloader)
1037 def report_download_webpage(self, video_id):
1038 """Report webpage download."""
1039 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1041 def report_extraction(self, video_id):
1042 """Report information extraction."""
1043 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Extract metadata and a play_redirect URL for a Vimeo video page.
1045 def _real_extract(self, url, new_video=True):
1046 # Extract ID from URL
1047 mobj = re.match(self._VALID_URL, url)
1049 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1052 video_id = mobj.group(1)
1054 # Retrieve video webpage to extract further information
1055 request = compat_urllib_request.Request(url, None, std_headers)
1057 self.report_download_webpage(video_id)
1058 webpage = compat_urllib_request.urlopen(request).read()
1059 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1060 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1063 # Now we begin extracting as much information as we can from what we
1064 # retrieved. First we extract the information common to all extractors,
1065 # and latter we extract those that are Vimeo specific.
1066 self.report_extraction(video_id)
1068 # Extract the config JSON
# NOTE: string-splitting the page on ' = {config:' / ',assets:' is fragile;
# it breaks whenever Vimeo reshuffles the embedded player bootstrap script.
1069 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1071 config = json.loads(config)
1073 self._downloader.trouble(u'ERROR: unable to extract info section')
1077 video_title = config["video"]["title"]
1080 video_uploader = config["video"]["owner"]["name"]
1082 # Extract video thumbnail
1083 video_thumbnail = config["video"]["thumbnail"]
1085 # Extract video description
1086 video_description = get_element_by_id("description", webpage.decode('utf8'))
1087 if video_description: video_description = clean_html(video_description)
1088 else: video_description = ''
1090 # Extract upload date
1091 video_upload_date = None
1092 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1093 if mobj is not None:
1094 video_upload_date = mobj.group(1)
1096 # Vimeo specific: extract request signature and timestamp
1097 sig = config['request']['signature']
1098 timestamp = config['request']['timestamp']
1100 # Vimeo specific: extract video codec and quality information
1101 # First consider quality, then codecs, then take everything
1102 # TODO bind to format param
1103 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1104 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec into hd/sd/other, preserving codec preference order.
1105 for codec_name, codec_extension in codecs:
1106 if codec_name in config["video"]["files"]:
1107 if 'hd' in config["video"]["files"][codec_name]:
1108 files['hd'].append((codec_name, codec_extension, 'hd'))
1109 elif 'sd' in config["video"]["files"][codec_name]:
1110 files['sd'].append((codec_name, codec_extension, 'sd'))
1112 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first (codec, extension, quality) tuple from the best non-empty bucket.
1114 for quality in ('hd', 'sd', 'other'):
1115 if len(files[quality]) > 0:
1116 video_quality = files[quality][0][2]
1117 video_codec = files[quality][0][0]
1118 video_extension = files[quality][0][1]
1119 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1122 self._downloader.trouble(u'ERROR: no known codec found')
# Build the signed play_redirect URL used as the actual media URL.
1125 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1126 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Tail of the returned info dictionary (leading entries elided in this listing).
1131 'uploader': video_uploader,
1132 'upload_date': video_upload_date,
1133 'title': video_title,
1134 'ext': video_extension,
1135 'thumbnail': video_thumbnail,
1136 'description': video_description,
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, dict/tuple delimiters) were elided from this listing — confirm against VCS.
1140 class ArteTvIE(InfoExtractor):
1141 """arte.tv information extractor."""
# Matches both French and German catalogue pages on videos.arte.tv.
1143 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages end in index-<n>.html; matched against the URL tail.
1144 _LIVE_URL = r'index-[0-9]+\.html$'
1146 IE_NAME = u'arte.tv'
1148 def __init__(self, downloader=None):
1149 InfoExtractor.__init__(self, downloader)
1151 def report_download_webpage(self, video_id):
1152 """Report webpage download."""
1153 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1155 def report_extraction(self, video_id):
1156 """Report information extraction."""
1157 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Fetch a URL and return its body; also bumps the downloader's download counter.
# NOTE(review): the return statement is elided in this listing — presumably it
# returns `webpage`; verify.
1159 def fetch_webpage(self, url):
1160 self._downloader.increment_downloads()
1161 request = compat_urllib_request.Request(url)
1163 self.report_download_webpage(url)
1164 webpage = compat_urllib_request.urlopen(request).read()
1165 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1166 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1168 except ValueError as err:
1169 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetch `url`, run `regex` with `regexFlags`, and collect the listed match
# groups into an info dict. matchTuples is a list of (group_index, key, error_msg).
1173 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1174 page = self.fetch_webpage(url)
1175 mobj = re.search(regex, page, regexFlags)
1179 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1182 for (i, key, err) in matchTuples:
1183 if mobj.group(i) is None:
1184 self._downloader.trouble(err)
1187 info[key] = mobj.group(i)
# Resolve a live-stream page: locate the videothek JS, then scrape the
# geo-restricted FR/DE stream path and SWF player from it.
1191 def extractLiveStream(self, url):
# Language code is a fixed path component of the live URL.
1192 video_lang = url.split('/')[-4]
1193 info = self.grep_webpage(
1195 r'src="(.*?/videothek_js.*?\.js)',
1198 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1201 http_host = url.split('/')[2]
1202 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1203 info = self.grep_webpage(
1205 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1206 '(http://.*?\.swf).*?' +
1210 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1211 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1212 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is built but no return/use is visible in this listing.
1215 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Resolve an Arte+7 (catch-up) page through its chain of XML references down
# to the HD media URL, returning the info dictionary.
1217 def extractPlus7Stream(self, url):
1218 video_lang = url.split('/')[-3]
1219 info = self.grep_webpage(
1221 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1224 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1227 next_url = compat_urllib_parse.unquote(info.get('url'))
1228 info = self.grep_webpage(
# Pick the <video> reference matching the page language.
1230 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1233 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1236 next_url = compat_urllib_parse.unquote(info.get('url'))
1238 info = self.grep_webpage(
1240 r'<video id="(.*?)".*?>.*?' +
1241 '<name>(.*?)</name>.*?' +
1242 '<dateVideo>(.*?)</dateVideo>.*?' +
1243 '<url quality="hd">(.*?)</url>',
1246 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1247 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1248 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1249 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Info dictionary fields assembled from the final XML document.
1254 'id': info.get('id'),
1255 'url': compat_urllib_parse.unquote(info.get('url')),
1256 'uploader': u'arte.tv',
1257 'upload_date': info.get('date'),
1258 'title': info.get('title'),
# Dispatch: live pages vs Arte+7 catch-up pages.
1264 def _real_extract(self, url):
1265 video_id = url.split('/')[-1]
1266 self.report_extraction(video_id)
1268 if re.search(self._LIVE_URL, video_id) is not None:
1269 self.extractLiveStream(url)
1272 info = self.extractPlus7Stream(url)
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, class docstring quotes) were elided from this listing — confirm against VCS.
1277 class GenericIE(InfoExtractor):
1278 """Generic last-resort information extractor."""
1281 IE_NAME = u'generic'
1283 def __init__(self, downloader=None):
1284 InfoExtractor.__init__(self, downloader)
1286 def report_download_webpage(self, video_id):
1287 """Report webpage download."""
# The generic IE warns loudly: it is only ever a fallback.
1288 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1289 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1291 def report_extraction(self, video_id):
1292 """Report information extraction."""
1293 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1295 def report_following_redirect(self, new_url):
1296 """Report information extraction."""
1297 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Detect URL-shortener style redirects by issuing a HEAD request; if the final
# URL differs, restart the whole download chain on it.
1299 def _test_redirect(self, url):
1300 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass forcing the HEAD method.
1301 class HeadRequest(compat_urllib_request.Request):
1302 def get_method(self):
1305 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1307 Subclass the HTTPRedirectHandler to make it use our
1308 HeadRequest also on the redirected URL
1310 def redirect_request(self, req, fp, code, msg, headers, newurl):
1311 if code in (301, 302, 303, 307):
1312 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD request carries no body.
1313 newheaders = dict((k,v) for k,v in req.headers.items()
1314 if k.lower() not in ("content-length", "content-type"))
1315 return HeadRequest(newurl,
1317 origin_req_host=req.get_origin_req_host(),
1320 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1322 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1324 Fallback to GET if HEAD is not allowed (405 HTTP error)
1326 def http_error_405(self, req, fp, code, msg, headers):
1330 newheaders = dict((k,v) for k,v in req.headers.items()
1331 if k.lower() not in ("content-length", "content-type"))
1332 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1334 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1338 opener = compat_urllib_request.OpenerDirector()
1339 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1340 HTTPMethodFallback, HEADRedirectHandler,
1341 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1342 opener.add_handler(handler())
1344 response = opener.open(HeadRequest(url))
1345 new_url = response.geturl()
1350 self.report_following_redirect(new_url)
# Hand the resolved URL back to the downloader to restart extractor selection.
1351 self._downloader.download([new_url])
# Last-resort extraction: scrape the page for a direct media URL in common
# flash-player embed patterns.
1354 def _real_extract(self, url):
1355 if self._test_redirect(url): return
1357 video_id = url.split('/')[-1]
1358 request = compat_urllib_request.Request(url)
1360 self.report_download_webpage(video_id)
1361 webpage = compat_urllib_request.urlopen(request).read()
1362 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1363 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1365 except ValueError as err:
1366 # since this is the last-resort InfoExtractor, if
1367 # this error is thrown, it'll be thrown here
1368 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371 self.report_extraction(video_id)
1372 # Start with something easy: JW Player in SWFObject
1373 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1375 # Broaden the search a little bit
1376 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1378 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1381 # It's possible that one of the regexes
1382 # matched, but returned an empty group:
1383 if mobj.group(1) is None:
1384 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1387 video_url = compat_urllib_parse.unquote(mobj.group(1))
1388 video_id = os.path.basename(video_url)
1390 # here's a fun little line of code for you:
1391 video_extension = os.path.splitext(video_id)[1][1:]
1392 video_id = os.path.splitext(video_id)[0]
1394 # it's tempting to parse this further, but you would
1395 # have to take into account all the variations like
1396 # Video Title - Site Name
1397 # Site Name | Video Title
1398 # Video Title - Tagline | Site Name
1399 # and so on and so forth; it's just not practical
1400 mobj = re.search(r'<title>(.*)</title>', webpage)
1402 self._downloader.trouble(u'ERROR: unable to extract title')
1404 video_title = mobj.group(1).decode('utf-8')
1406 # video uploader is domain name
1407 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says "title" but this step extracts the
# uploader (domain name) — the message looks copy-pasted; consider fixing.
1409 self._downloader.trouble(u'ERROR: unable to extract title')
1411 video_uploader = mobj.group(1).decode('utf-8')
# Info dictionary (surrounding return/braces elided in this listing).
1414 'id': video_id.decode('utf-8'),
1415 'url': video_url.decode('utf-8'),
1416 'uploader': video_uploader,
1417 'upload_date': None,
1418 'title': video_title,
1419 'ext': video_extension.decode('utf-8'),
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns) were elided from this listing — confirm against VCS.
1423 class YoutubeSearchIE(InfoExtractor):
1424 """Information Extractor for YouTube search queries."""
# Accepts ytsearch:Q, ytsearchN:Q and ytsearchall:Q pseudo-URLs.
1425 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData v2 JSON-C endpoint; paged 50 results at a time via start-index.
1426 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1427 _max_youtube_results = 1000
1428 IE_NAME = u'youtube:search'
1430 def __init__(self, downloader=None):
1431 InfoExtractor.__init__(self, downloader)
1433 def report_download_page(self, query, pagenum):
1434 """Report attempt to download search page with given number."""
1435 query = query.decode(preferredencoding())
1436 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the ytsearch prefix and delegate to _download_n_results with the
# requested result count (1 by default, _max_youtube_results for 'all').
1438 def _real_extract(self, query):
1439 mobj = re.match(self._VALID_URL, query)
1441 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1444 prefix, query = query.split(':')
1446 query = query.encode('utf-8')
1448 self._download_n_results(query, 1)
1450 elif prefix == 'all':
1451 self._download_n_results(query, self._max_youtube_results)
1457 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1459 elif n > self._max_youtube_results:
1460 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1461 n = self._max_youtube_results
1462 self._download_n_results(query, n)
1464 except ValueError: # parsing prefix as integer fails
1465 self._download_n_results(query, 1)
1468 def _download_n_results(self, query, n):
1469 """Downloads a specified number of results for a query"""
# Page through the API until `limit` (min of n and totalItems) is covered.
1475 while (50 * pagenum) < limit:
1476 self.report_download_page(query, pagenum+1)
1477 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1478 request = compat_urllib_request.Request(result_url)
1480 data = compat_urllib_request.urlopen(request).read()
1481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1482 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1484 api_response = json.loads(data)['data']
1486 new_ids = list(video['id'] for video in api_response['items'])
1487 video_ids += new_ids
1489 limit = min(n, api_response['totalItems'])
# Truncate to exactly n ids, then queue each watch URL on the downloader.
1492 if len(video_ids) > n:
1493 video_ids = video_ids[:n]
1494 for id in video_ids:
1495 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, loop headers) were elided from this listing — confirm against VCS.
1499 class GoogleSearchIE(InfoExtractor):
1500 """Information Extractor for Google Video search queries."""
# Accepts gvsearch:Q, gvsearchN:Q and gvsearchall:Q pseudo-URLs.
1501 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1502 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1503 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager link signals more results.
1504 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1505 _max_google_results = 1000
1506 IE_NAME = u'video.google:search'
1508 def __init__(self, downloader=None):
1509 InfoExtractor.__init__(self, downloader)
1511 def report_download_page(self, query, pagenum):
1512 """Report attempt to download playlist page with given number."""
1513 query = query.decode(preferredencoding())
1514 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the gvsearch prefix and delegate to _download_n_results (same contract
# as the other search IEs in this file).
1516 def _real_extract(self, query):
1517 mobj = re.match(self._VALID_URL, query)
1519 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1522 prefix, query = query.split(':')
1524 query = query.encode('utf-8')
1526 self._download_n_results(query, 1)
1528 elif prefix == 'all':
1529 self._download_n_results(query, self._max_google_results)
1535 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1537 elif n > self._max_google_results:
1538 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1539 n = self._max_google_results
1540 self._download_n_results(query, n)
1542 except ValueError: # parsing prefix as integer fails
1543 self._download_n_results(query, 1)
1546 def _download_n_results(self, query, n):
1547 """Downloads a specified number of results for a query"""
# Scrape HTML result pages 10 at a time until n ids are collected or the
# "next page" indicator disappears.
1553 self.report_download_page(query, pagenum)
1554 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1555 request = compat_urllib_request.Request(result_url)
1557 page = compat_urllib_request.urlopen(request).read()
1558 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1559 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1562 # Extract video identifiers
1563 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1564 video_id = mobj.group(1)
1565 if video_id not in video_ids:
1566 video_ids.append(video_id)
1567 if len(video_ids) == n:
1568 # Specified n videos reached
1569 for id in video_ids:
1570 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: flush whatever ids were collected.
1573 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1574 for id in video_ids:
1575 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1578 pagenum = pagenum + 1
# NOTE(review): gaps in the embedded numbering indicate lines (guards, try:,
# returns, loop headers) were elided from this listing — confirm against VCS.
1581 class YahooSearchIE(InfoExtractor):
1582 """Information Extractor for Yahoo! Video search queries."""
# Accepts yvsearch:Q, yvsearchN:Q and yvsearchall:Q pseudo-URLs.
1583 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1584 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1585 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1586 _MORE_PAGES_INDICATOR = r'\s*Next'
1587 _max_yahoo_results = 1000
1588 IE_NAME = u'video.yahoo:search'
1590 def __init__(self, downloader=None):
1591 InfoExtractor.__init__(self, downloader)
1593 def report_download_page(self, query, pagenum):
1594 """Report attempt to download playlist page with given number."""
1595 query = query.decode(preferredencoding())
1596 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the yvsearch prefix and delegate to _download_n_results (same contract
# as the other search IEs in this file).
1598 def _real_extract(self, query):
1599 mobj = re.match(self._VALID_URL, query)
1601 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1604 prefix, query = query.split(':')
1606 query = query.encode('utf-8')
1608 self._download_n_results(query, 1)
1610 elif prefix == 'all':
1611 self._download_n_results(query, self._max_yahoo_results)
1617 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1619 elif n > self._max_yahoo_results:
1620 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1621 n = self._max_yahoo_results
1622 self._download_n_results(query, n)
1624 except ValueError: # parsing prefix as integer fails
1625 self._download_n_results(query, 1)
1628 def _download_n_results(self, query, n):
1629 """Downloads a specified number of results for a query"""
# already_seen guards against duplicate ids across result pages.
1632 already_seen = set()
1636 self.report_download_page(query, pagenum)
1637 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1638 request = compat_urllib_request.Request(result_url)
1640 page = compat_urllib_request.urlopen(request).read()
1641 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1642 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1645 # Extract video identifiers
1646 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1647 video_id = mobj.group(1)
1648 if video_id not in already_seen:
1649 video_ids.append(video_id)
1650 already_seen.add(video_id)
1651 if len(video_ids) == n:
1652 # Specified n videos reached
1653 for id in video_ids:
1654 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: flush whatever ids were collected.
1657 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1658 for id in video_ids:
1659 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1662 pagenum = pagenum + 1
# NOTE(review): gaps in the embedded numbering indicate lines (guards, loop
# headers, returns) were elided from this listing — confirm against VCS.
1665 class YoutubePlaylistIE(InfoExtractor):
1666 """Information Extractor for YouTube playlists."""
# group(1) = prefix type (p/a/list), group(2) = playlist id,
# group(3) = optional embedded video id (downloaded directly if present).
1668 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1669 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1670 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1671 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1672 IE_NAME = u'youtube:playlist'
1674 def __init__(self, downloader=None):
1675 InfoExtractor.__init__(self, downloader)
1677 def report_download_page(self, playlist_id, pagenum):
1678 """Report attempt to download playlist page with given number."""
1679 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1681 def _real_extract(self, url):
1682 # Extract playlist id
1683 mobj = re.match(self._VALID_URL, url)
1685 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video inside a playlist URL: download just that video.
1689 if mobj.group(3) is not None:
1690 self._downloader.download([mobj.group(3)])
1693 # Download playlist pages
1694 # prefix is 'p' as default for playlists but there are other types that need extra care
1695 playlist_prefix = mobj.group(1)
1696 if playlist_prefix == 'a':
1697 playlist_access = 'artist'
1699 playlist_prefix = 'p'
1700 playlist_access = 'view_play_list'
1701 playlist_id = mobj.group(2)
# Page through the playlist HTML until the pager's "next" control disappears.
1706 self.report_download_page(playlist_id, pagenum)
1707 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1708 request = compat_urllib_request.Request(url)
1710 page = compat_urllib_request.urlopen(request).read()
1711 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1712 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1715 # Extract video identifiers
1717 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1718 if mobj.group(1) not in ids_in_page:
1719 ids_in_page.append(mobj.group(1))
1720 video_ids.extend(ids_in_page)
1722 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1724 pagenum = pagenum + 1
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1726 playliststart = self._downloader.params.get('playliststart', 1) - 1
1727 playlistend = self._downloader.params.get('playlistend', -1)
1728 if playlistend == -1:
1729 video_ids = video_ids[playliststart:]
1731 video_ids = video_ids[playliststart:playlistend]
1733 for id in video_ids:
1734 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate lines (guards, loop
# headers, returns) were elided from this listing — confirm against VCS.
1738 class YoutubeChannelIE(InfoExtractor):
1739 """Information Extractor for YouTube channels."""
1741 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# List view sorted by date-added, paged via &page=N.
1742 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1743 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1744 IE_NAME = u'youtube:channel'
1746 def report_download_page(self, channel_id, pagenum):
1747 """Report attempt to download channel page with given number."""
1748 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1750 def _real_extract(self, url):
1751 # Extract channel id
1752 mobj = re.match(self._VALID_URL, url)
1754 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1757 # Download channel pages
1758 channel_id = mobj.group(1)
# Page through the channel's list view until the "Next" button disappears.
1763 self.report_download_page(channel_id, pagenum)
1764 url = self._TEMPLATE_URL % (channel_id, pagenum)
1765 request = compat_urllib_request.Request(url)
1767 page = compat_urllib_request.urlopen(request).read()
1768 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1769 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1772 # Extract video identifiers
1774 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1775 if mobj.group(1) not in ids_in_page:
1776 ids_in_page.append(mobj.group(1))
1777 video_ids.extend(ids_in_page)
1779 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1781 pagenum = pagenum + 1
# Queue every collected video on the downloader.
1783 for id in video_ids:
1784 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate lines (loop headers,
# guards, breaks) were elided from this listing — confirm against VCS.
1788 class YoutubeUserIE(InfoExtractor):
1789 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs and the ytuser:NAME shorthand.
1791 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1792 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; pagination is driven by start-index.
1793 _GDATA_PAGE_SIZE = 50
1794 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1795 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1796 IE_NAME = u'youtube:user'
1798 def __init__(self, downloader=None):
1799 InfoExtractor.__init__(self, downloader)
1801 def report_download_page(self, username, start_index):
1802 """Report attempt to download user page."""
1803 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1804 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1806 def _real_extract(self, url):
1808 mobj = re.match(self._VALID_URL, url)
1810 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1813 username = mobj.group(1)
1815 # Download video ids using YouTube Data API. Result size per
1816 # query is limited (currently to 50 videos) so we need to query
1817 # page by page until there are no video ids - it means we got
# start-index is 1-based in the GData API.
1824 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1825 self.report_download_page(username, start_index)
1827 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1830 page = compat_urllib_request.urlopen(request).read()
1831 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1832 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1835 # Extract video identifiers
1838 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1839 if mobj.group(1) not in ids_in_page:
1840 ids_in_page.append(mobj.group(1))
1842 video_ids.extend(ids_in_page)
1844 # A little optimization - if current page is not
1845 # "full", ie. does not contain PAGE_SIZE video ids then
1846 # we can assume that this page is the last one - there
1847 # are no more ids on further pages - no need to query
1850 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1855 all_ids_count = len(video_ids)
1856 playliststart = self._downloader.params.get('playliststart', 1) - 1
1857 playlistend = self._downloader.params.get('playlistend', -1)
1859 if playlistend == -1:
1860 video_ids = video_ids[playliststart:]
1862 video_ids = video_ids[playliststart:playlistend]
1864 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1865 (username, all_ids_count, len(video_ids)))
1867 for video_id in video_ids:
1868 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): gaps in the embedded numbering indicate lines (loop headers,
# guards, breaks) were elided from this listing — confirm against VCS.
1871 class BlipTVUserIE(InfoExtractor):
1872 """Information Extractor for blip.tv users."""
# Accepts blip.tv/USERNAME URLs and the bliptvuser:USERNAME shorthand.
1874 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1876 IE_NAME = u'blip.tv:user'
1878 def __init__(self, downloader=None):
1879 InfoExtractor.__init__(self, downloader)
1881 def report_download_page(self, username, pagenum):
1882 """Report attempt to download user page."""
1883 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1884 (self.IE_NAME, username, pagenum))
1886 def _real_extract(self, url):
1888 mobj = re.match(self._VALID_URL, url)
1890 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1893 username = mobj.group(1)
# Mobile AJAX endpoint keyed by the numeric users_id scraped from the profile page.
1895 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1897 request = compat_urllib_request.Request(url)
1900 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1901 mobj = re.search(r'data-users-id="([^"]+)"', page)
1902 page_base = page_base % mobj.group(1)
1903 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1904 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1908 # Download video ids using BlipTV Ajax calls. Result size per
1909 # query is limited (currently to 12 videos) so we need to query
1910 # page by page until there are no video ids - it means we got
1917 self.report_download_page(username, pagenum)
1919 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1922 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1924 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1927 # Extract video identifiers
1930 for mobj in re.finditer(r'href="/([^"]+)"', page):
1931 if mobj.group(1) not in ids_in_page:
1932 ids_in_page.append(unescapeHTML(mobj.group(1)))
1934 video_ids.extend(ids_in_page)
1936 # A little optimization - if current page is not
1937 # "full", ie. does not contain PAGE_SIZE video ids then
1938 # we can assume that this page is the last one - there
1939 # are no more ids on further pages - no need to query
# NOTE(review): self._PAGE_SIZE is referenced here but its definition is not
# visible in this listing (possibly elided) — verify it exists on the class.
1942 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing (1-based options, -1 = open end).
1947 all_ids_count = len(video_ids)
1948 playliststart = self._downloader.params.get('playliststart', 1) - 1
1949 playlistend = self._downloader.params.get('playlistend', -1)
1951 if playlistend == -1:
1952 video_ids = video_ids[playliststart:]
1954 video_ids = video_ids[playliststart:playlistend]
1956 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1957 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1959 for video_id in video_ids:
1960 self._downloader.download([u'http://blip.tv/'+video_id])
1963 class DepositFilesIE(InfoExtractor):
1964     """Information extractor for depositfiles.com"""
# The (?#locale) comment documents the optional two-letter locale path
# segment ("../" matches it); group(1) is the file id.
1966     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1967     IE_NAME = u'DepositFiles'
1969     def __init__(self, downloader=None):
1970         InfoExtractor.__init__(self, downloader)
1972     def report_download_webpage(self, file_id):
1973         """Report webpage download."""
1974         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1976     def report_extraction(self, file_id):
1977         """Report information extraction."""
1978         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1980     def _real_extract(self, url):
1981         file_id = url.split('/')[-1]
1982         # Rebuild url in english locale
1983         url = 'http://depositfiles.com/en/files/' + file_id
1985         # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking "Free download".
1986         free_download_indication = { 'gateway_result' : '1' }
1987         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1989             self.report_download_webpage(file_id)
1990             webpage = compat_urllib_request.urlopen(request).read()
1991         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1992             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1995         # Search for the real file URL
1996         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1997         if (mobj is None) or (mobj.group(1) is None):
1998             # Try to figure out reason of the error.
# The site explains download restrictions in an <strong>Attention...
# banner; surface that message instead of a generic error when present.
1999             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2000             if (mobj is not None) and (mobj.group(1) is not None):
2001                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2002                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2004             self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2007         file_url = mobj.group(1)
2008         file_extension = os.path.splitext(file_url)[1][1:]
2010         # Search for file title
2011         mobj = re.search(r'<b title="(.*?)">', webpage)
2013             self._downloader.trouble(u'ERROR: unable to extract title')
2015         file_title = mobj.group(1).decode('utf-8')
# NOTE(review): the .decode('utf-8') calls assume Python 2 byte strings.
2018             'id': file_id.decode('utf-8'),
2019             'url': file_url.decode('utf-8'),
2021             'upload_date': None,
2022             'title': file_title,
2023             'ext': file_extension.decode('utf-8'),
2027 class FacebookIE(InfoExtractor):
2028     """Information Extractor for Facebook"""
# Matches video/photo permalinks with a v=<digits> query parameter; the
# numeric id is the named group ID.
2031     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2032     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2033     _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; _real_extract picks the first
# available one as "best quality".
2034     _available_formats = ['video', 'highqual', 'lowqual']
2035     _video_extensions = {
2040     IE_NAME = u'facebook'
2042     def __init__(self, downloader=None):
2043         InfoExtractor.__init__(self, downloader)
2045     def _reporter(self, message):
2046         """Add header and report message."""
2047         self._downloader.to_screen(u'[facebook] %s' % message)
2049     def report_login(self):
2050         """Report attempt to log in."""
2051         self._reporter(u'Logging in')
2053     def report_video_webpage_download(self, video_id):
2054         """Report attempt to download video webpage."""
2055         self._reporter(u'%s: Downloading video webpage' % video_id)
2057     def report_information_extraction(self, video_id):
2058         """Report attempt to extract video information."""
2059         self._reporter(u'%s: Extracting video information' % video_id)
2061     def _parse_page(self, video_webpage):
2062         """Extract video information from page"""
# Each field is scraped from inline JavaScript blobs on the watch page;
# missing fields are simply omitted from the returned dict.
2064         data = {'title': r'\("video_title", "(.*?)"\)',
2065             'description': r'<div class="datawrap">(.*?)</div>',
2066             'owner': r'\("video_owner_name", "(.*?)"\)',
2067             'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2070         for piece in data.keys():
2071             mobj = re.search(data[piece], video_webpage)
2072             if mobj is not None:
# NOTE(review): .decode("unicode_escape") assumes Python 2 byte strings.
2073                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one candidate URL per known format name.
2077         for fmt in self._available_formats:
2078             mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2079             if mobj is not None:
2080                 # URL is in a Javascript segment inside an escaped Unicode format within
2081                 # the generally utf-8 page
2082                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2083         video_info['video_urls'] = video_urls
2087     def _real_initialize(self):
# Log in before extraction; --username/--password take precedence over
# .netrc credentials for the "facebook" machine entry.
2088         if self._downloader is None:
2093         downloader_params = self._downloader.params
2095         # Attempt to use provided username and password or .netrc data
2096         if downloader_params.get('username', None) is not None:
2097             useremail = downloader_params['username']
2098             password = downloader_params['password']
2099         elif downloader_params.get('usenetrc', False):
2101                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2102                 if info is not None:
2106                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2107             except (IOError, netrc.NetrcParseError) as err:
2108                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2111         if useremail is None:
# POST the mobile login form; a failed login only warns — extraction of
# public videos is still attempted.
2120         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2123             login_results = compat_urllib_request.urlopen(request).read()
2124             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2125                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2127         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2128             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2131     def _real_extract(self, url):
2132         mobj = re.match(self._VALID_URL, url)
2134             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2136         video_id = mobj.group('ID')
2139         self.report_video_webpage_download(video_id)
2140         request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2142             page = compat_urllib_request.urlopen(request)
2143             video_webpage = page.read()
2144         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2145             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2148         # Start extracting information
2149         self.report_information_extraction(video_id)
2151         # Extract information
2152         video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; thumbnail only warns when absent.
2155         if 'owner' not in video_info:
2156             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2158         video_uploader = video_info['owner']
2161         if 'title' not in video_info:
2162             self._downloader.trouble(u'ERROR: unable to extract video title')
2164         video_title = video_info['title']
2165         video_title = video_title.decode('utf-8')
2168         if 'thumbnail' not in video_info:
2169             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2170             video_thumbnail = ''
2172             video_thumbnail = video_info['thumbnail']
# Normalize the RFC-2822 style date on the page to YYYYMMDD.
2176         if 'upload_date' in video_info:
2177             upload_time = video_info['upload_date']
2178             timetuple = email.utils.parsedate_tz(upload_time)
2179             if timetuple is not None:
2181                     upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2186         video_description = video_info.get('description', 'No description available.')
2188         url_map = video_info['video_urls']
2189         if len(url_map.keys()) > 0:
2190             # Decide which formats to download
2191             req_format = self._downloader.params.get('format', None)
2192             format_limit = self._downloader.params.get('format_limit', None)
# --max-quality caps the list at the limit format and everything worse.
2194             if format_limit is not None and format_limit in self._available_formats:
2195                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2197                 format_list = self._available_formats
2198             existing_formats = [x for x in format_list if x in url_map]
2199             if len(existing_formats) == 0:
2200                 self._downloader.trouble(u'ERROR: no known formats available for video')
2202             if req_format is None:
2203                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2204             elif req_format == 'worst':
2205                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2206             elif req_format == '-1':
2207                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2210                 if req_format not in url_map:
2211                     self._downloader.trouble(u'ERROR: requested format not available')
2213                 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
2216         for format_param, video_real_url in video_url_list:
2218             video_extension = self._video_extensions.get(format_param, 'mp4')
2221                 'id': video_id.decode('utf-8'),
2222                 'url': video_real_url.decode('utf-8'),
2223                 'uploader': video_uploader.decode('utf-8'),
2224                 'upload_date': upload_date,
2225                 'title': video_title,
2226                 'ext': video_extension.decode('utf-8'),
2227                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2228                 'thumbnail': video_thumbnail.decode('utf-8'),
2229                 'description': video_description.decode('utf-8'),
2233 class BlipTVIE(InfoExtractor):
2234     """Information extractor for blip.tv"""
2236     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off a media URL.
2237     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2238     IE_NAME = u'blip.tv'
2240     def report_extraction(self, file_id):
2241         """Report information extraction."""
2242         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2244     def report_direct_download(self, title):
2245         """Report information extraction."""
2246         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2248     def _real_extract(self, url):
2249         mobj = re.match(self._VALID_URL, url)
2251             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for the JSON description of the page (skin=json).
2258         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2259         request = compat_urllib_request.Request(json_url.encode('utf-8'))
2260         self.report_extraction(mobj.group(1))
2263             urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself rather than JSON, build a
# minimal info dict straight from the URL's basename.
2264             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2265                 basename = url.split('/')[-1]
2266                 title,ext = os.path.splitext(basename)
2267                 title = title.decode('UTF-8')
2268                 ext = ext.replace('.', '')
2269                 self.report_direct_download(title)
2274                     'upload_date': None,
2279         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2280             self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2282         if info is None: # Regular URL
2284                 json_code = urlh.read()
2285             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2286                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
# Parse the JSON payload; blip.tv wraps the record in a 'Post' key for
# single videos.
2290                 json_data = json.loads(json_code)
2291                 if 'Post' in json_data:
2292                     data = json_data['Post']
# datestamp looks like "05-31-12 10:09AM"; normalize to YYYYMMDD.
2296                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2297                 video_url = data['media']['url']
2298                 umobj = re.match(self._URL_EXT, video_url)
2300                     raise ValueError('Can not determine filename extension')
2301                 ext = umobj.group(1)
2304                     'id': data['item_id'],
2306                     'uploader': data['display_name'],
2307                     'upload_date': upload_date,
2308                     'title': data['title'],
2310                     'format': data['media']['mimeType'],
2311                     'thumbnail': data['thumbnailUrl'],
2312                     'description': data['description'],
2313                     'player_url': data['embedUrl']
2315             except (ValueError,KeyError) as err:
2316                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): mutates the module-level std_headers dict, so the iTunes
# user agent leaks into every later request — confirm this is intended.
2319         std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # group(1) is the numeric video id, group(2) the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL and title from a myvideo.de watch page.

        Returns a one-element list of info dictionaries on success, or
        returns None after reporting trouble, like the sibling extractors.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble(...) — there is no
            # _download attribute (set_downloader assigns _downloader),
            # so every invalid URL raised AttributeError instead of
            # reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The image_src thumbnail link doubles as the media base URL; the
        # flv lives beside the thumbs directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2381 class ComedyCentralIE(InfoExtractor):
2382     """Information extractor for The Daily Show and Colbert Report """
# Accepts either a bare shortname (":tds", ":colbert", ...) or a full
# full-episodes URL; named groups: shortname, showname, episode.
2384     _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2385     IE_NAME = u'comedycentral'
# Known rendition bitrates, worst-first (turls[-1] below is "best").
2387     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2389     _video_extensions = {
2397     _video_dimensions = {
2406     def report_extraction(self, episode_id):
2407         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2409     def report_config_download(self, episode_id):
2410         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2412     def report_index_download(self, episode_id):
2413         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2415     def report_player_url(self, episode_id):
2416         self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2419     def _print_formats(self, formats):
# Print "bitrate : ext [dimensions]" lines for --list-formats.
2420         print('Available formats:')
2422             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2425     def _real_extract(self, url):
2426         mobj = re.match(self._VALID_URL, url)
2428             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Bare shortnames are rewritten to the show's full-episodes front page
# and re-matched so the rest of the method sees a normal URL.
2431         if mobj.group('shortname'):
2432             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2433                 url = u'http://www.thedailyshow.com/full-episodes/'
2435                 url = u'http://www.colbertnation.com/full-episodes/'
2436             mobj = re.match(self._VALID_URL, url)
2437             assert mobj is not None
2439         dlNewest = not mobj.group('episode')
2441             epTitle = mobj.group('showname')
2443             epTitle = mobj.group('episode')
2445         req = compat_urllib_request.Request(url)
2446         self.report_extraction(epTitle)
2448             htmlHandle = compat_urllib_request.urlopen(req)
2449             html = htmlHandle.read()
2450         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2451             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The front page redirects to the newest episode; re-match the final
# URL to recover a concrete episode title.
2454             url = htmlHandle.geturl()
2455             mobj = re.match(self._VALID_URL, url)
2457                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2459             if mobj.group('episode') == '':
2460                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2462             epTitle = mobj.group('episode')
# Locate the mtvnservices player URL embedded in the page.
2464         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2466         if len(mMovieParams) == 0:
2467             # The Colbert Report embeds the information in a without
2468             # a URL prefix; so extract the alternate reference
2469             # and then add the URL prefix manually.
2471             altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2472             if len(altMovieParams) == 0:
2473                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2476                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2478         playerUrl_raw = mMovieParams[0][0]
2479         self.report_player_url(epTitle)
2481             urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2482             playerUrl = urlHandle.geturl()
2483         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2484             self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# The mrss feed lists one <item> per part of the episode.
2487         uri = mMovieParams[0][1]
2488         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2489         self.report_index_download(epTitle)
2491             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2492         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2493             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2498         idoc = xml.etree.ElementTree.fromstring(indexXml)
2499         itemEls = idoc.findall('.//item')
2500         for itemEl in itemEls:
# The guid looks like "...:<show>.com:<id>"; split out both pieces.
2501             mediaId = itemEl.findall('./guid')[0].text
2502             shortMediaId = mediaId.split(':')[-1]
2503             showId = mediaId.split(':')[-2].replace('.com', '')
2504             officialTitle = itemEl.findall('./title')[0].text
2505             officialDate = itemEl.findall('./pubDate')[0].text
2507             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2508                     compat_urllib_parse.urlencode({'uri': mediaId}))
2509             configReq = compat_urllib_request.Request(configUrl)
2510             self.report_config_download(epTitle)
2512                 configXml = compat_urllib_request.urlopen(configReq).read()
2513             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2514                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs for every rendition of this part.
2517             cdoc = xml.etree.ElementTree.fromstring(configXml)
2519             for rendition in cdoc.findall('.//rendition'):
2520                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2524                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2527             if self._downloader.params.get('listformats', None):
2528                 self._print_formats([i[0] for i in turls])
2531             # For now, just pick the highest bitrate
2532             format,video_url = turls[-1]
2534             # Get the format arg from the arg stream
2535             req_format = self._downloader.params.get('format', None)
2537             # Select format if we can find one
2540                 format, video_url = f, v
2543             # Patch to download from alternative CDN, which does not
2544             # break on current RTMPDump builds
2545             broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2546             better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2548             if video_url.startswith(broken_cdn):
2549                 video_url = video_url.replace(broken_cdn, better_cdn)
2551             effTitle = showId + u'-' + epTitle
2556                 'upload_date': officialDate,
2561                 'description': officialTitle,
2562                 'player_url': None #playerUrl
2565             results.append(info)
2570 class EscapistIE(InfoExtractor):
2571     """Information extractor for The Escapist """
# Named groups: showname (series slug) and episode (video slug).
2573     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2574     IE_NAME = u'escapist'
2576     def report_extraction(self, showName):
2577         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2579     def report_config_download(self, showName):
2580         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2582     def _real_extract(self, url):
2583         mobj = re.match(self._VALID_URL, url)
2585             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2587         showName = mobj.group('showname')
2588         videoId = mobj.group('episode')
2590         self.report_extraction(showName)
2592             webPage = compat_urllib_request.urlopen(url)
2593             webPageBytes = webPage.read()
# Decode using the charset declared in the Content-Type header,
# defaulting to utf-8 when none is declared.
2594             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2595             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2596         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2597             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull description/thumbnail/player from the page's meta tags, then
# recover the JSON config URL from the player's "config=" parameter.
# NOTE(review): each .group(1) assumes its <meta> tag matched; a missing
# tag raises AttributeError here instead of a clean extractor error.
2600         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2601         description = unescapeHTML(descMatch.group(1))
2602         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2603         imgUrl = unescapeHTML(imgMatch.group(1))
2604         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2605         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2606         configUrlMatch = re.search('config=(.*)$', playerUrl)
2607         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2609         self.report_config_download(showName)
2611             configJSON = compat_urllib_request.urlopen(configUrl).read()
2612         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2613             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2616         # Technically, it's JavaScript, not JSON
2617         configJSON = configJSON.replace("'", '"')
2620             config = json.loads(configJSON)
2621         except (ValueError,) as err:
2622             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The second playlist entry carries the actual media URL.
2625         playlist = config['playlist']
2626         videoUrl = playlist[1]['url']
2631             'uploader': showName,
2632             'upload_date': None,
2635             'thumbnail': imgUrl,
2636             'description': description,
2637             'player_url': playerUrl,
2643 class CollegeHumorIE(InfoExtractor):
2644     """Information extractor for collegehumor.com"""
# videoid is the numeric id from the URL; shorttitle is the slug.
2646     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2647     IE_NAME = u'collegehumor'
2649     def report_webpage(self, video_id):
2650         """Report information extraction."""
2651         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2653     def report_extraction(self, video_id):
2654         """Report information extraction."""
2655         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2657     def _real_extract(self, url):
2658         mobj = re.match(self._VALID_URL, url)
2660             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2662         video_id = mobj.group('videoid')
2664         self.report_webpage(video_id)
2665         request = compat_urllib_request.Request(url)
2667             webpage = compat_urllib_request.urlopen(request).read()
2668         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2669             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page embeds an internal numeric id (distinct from the URL's
# videoid) needed to query the moogaloop metadata service.
2672         m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2674             self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2676         internal_video_id = m.group('internalvideoid')
2680             'internal_id': internal_video_id,
2682             'upload_date': None,
2685         self.report_extraction(video_id)
2686         xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2688             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2689         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2690             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The moogaloop XML's <video> node provides description, caption (the
# title), file URL and thumbnail; ext is taken from the file URL suffix.
2693         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2695             videoNode = mdoc.findall('./video')[0]
2696             info['description'] = videoNode.findall('./description')[0].text
2697             info['title'] = videoNode.findall('./caption')[0].text
2698             info['url'] = videoNode.findall('./file')[0].text
2699             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2700             info['ext'] = info['url'].rpartition('.')[2]
2702             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2708 class XVideosIE(InfoExtractor):
2709     """Information extractor for xvideos.com"""
2711     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2712     IE_NAME = u'xvideos'
2714     def report_webpage(self, video_id):
2715         """Report information extraction."""
2716         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2718     def report_extraction(self, video_id):
2719         """Report information extraction."""
2720         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2722     def _real_extract(self, url):
2723         mobj = re.match(self._VALID_URL, url)
2725             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2727         video_id = mobj.group(1).decode('utf-8')
2729         self.report_webpage(video_id)
2731         request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2733             webpage = compat_urllib_request.urlopen(request).read()
2734         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2735             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2738         self.report_extraction(video_id)
# The flv URL is URL-encoded inside the page's flv_url parameter.
2742         mobj = re.search(r'flv_url=(.+?)&', webpage)
2744             self._downloader.trouble(u'ERROR: unable to extract video url')
2746         video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text minus the trailing "- XVIDEOS..." suffix.
2750         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2752             self._downloader.trouble(u'ERROR: unable to extract video title')
2754         video_title = mobj.group(1).decode('utf-8')
2757         # Extract video thumbnail
# NOTE(review): the dots in "img.*?" and "xvideos.com" are unescaped and
# match any character — harmless in practice but worth tightening.
2758         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2760             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2762         video_thumbnail = mobj.group(0).decode('utf-8')
2768             'upload_date': None,
2769             'title': video_title,
2771             'thumbnail': video_thumbnail,
2772             'description': None,
2778 class SoundcloudIE(InfoExtractor):
2779     """Information extractor for soundcloud.com
2780        To access the media, the uid of the song and a stream token
2781        must be extracted from the page source and the script must make
2782        a request to media.soundcloud.com/crossdomain.xml. Then
2783        the media can be grabbed by requesting from an url composed
2784        of the stream token and uid
# group(1) is the uploader slug, group(2) the track slug.
2787     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2788     IE_NAME = u'soundcloud'
2790     def __init__(self, downloader=None):
2791         InfoExtractor.__init__(self, downloader)
2793     def report_webpage(self, video_id):
2794         """Report information extraction."""
2795         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2797     def report_extraction(self, video_id):
2798         """Report information extraction."""
2799         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2801     def _real_extract(self, url):
2802         mobj = re.match(self._VALID_URL, url)
2804             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2807         # extract uploader (which is in the url)
2808         uploader = mobj.group(1).decode('utf-8')
2809         # extract simple title (uploader + slug of song title)
2810         slug_title =  mobj.group(2).decode('utf-8')
# Fallback title used when the real title can't be scraped below.
2811         simple_title = uploader + u'-' + slug_title
2813         self.report_webpage('%s/%s' % (uploader, slug_title))
2815         request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2817             webpage = compat_urllib_request.urlopen(request).read()
2818         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2819             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2822         self.report_extraction('%s/%s' % (uploader, slug_title))
2824         # extract uid and stream token that soundcloud hands out for access
2825         mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2827             video_id = mobj.group(1)
2828             stream_token = mobj.group(2)
2830         # extract unsimplified title
2831         mobj = re.search('"title":"(.*?)",', webpage)
2833             title = mobj.group(1).decode('utf-8')
2835             title = simple_title
2837         # construct media url (with uid/token)
2838         mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2839         mediaURL = mediaURL % (video_id, stream_token)
2842         description = u'No description available'
2843         mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2845             description = mobj.group(1)
# Page date looks like e.g. "November 3, 2011 14:32"; normalize to
# YYYYMMDD, logging (not aborting) on parse failure.
2849         mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2852                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2853             except Exception as err:
2854                 self._downloader.to_stderr(compat_str(err))
2856         # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): this Request appears to be constructed but never opened
# in the visible code — confirm whether the crossdomain fetch happens.
2857         request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2860             'id':       video_id.decode('utf-8'),
2862             'uploader': uploader.decode('utf-8'),
2863             'upload_date':  upload_date,
2866             'description': description.decode('utf-8')
2870 class InfoQIE(InfoExtractor):
2871     """Information extractor for infoq.com"""
2873     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2876     def report_webpage(self, video_id):
2877         """Report information extraction."""
2878         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2880     def report_extraction(self, video_id):
2881         """Report information extraction."""
2882         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2884     def _real_extract(self, url):
2885         mobj = re.match(self._VALID_URL, url)
2887             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2890         self.report_webpage(url)
2892         request = compat_urllib_request.Request(url)
2894             webpage = compat_urllib_request.urlopen(request).read()
2895         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2896             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2899         self.report_extraction(url)
# The rtmpe path is base64-encoded in the page's jsclassref attribute.
# NOTE(review): str.decode('base64') exists only on Python 2.
2903         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2905             self._downloader.trouble(u'ERROR: unable to extract video url')
2907         video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2911         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2913             self._downloader.trouble(u'ERROR: unable to extract video title')
2915         video_title = mobj.group(1).decode('utf-8')
2917         # Extract description
2918         video_description = u'No description available.'
2919         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2920         if mobj is not None:
2921             video_description = mobj.group(1).decode('utf-8')
# The media filename's last path component yields both id and extension.
2923         video_filename = video_url.split('/')[-1]
2924         video_id, extension = video_filename.split('.')
2930             'upload_date': None,
2931             'title': video_title,
2932             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2934             'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name -> either {bitrate: [urls]} or a plain
        [urls] list when no bitrate information exists.  'best' (or an
        unknown bitrate) selects the highest available bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None
        return None

    def _print_formats(self, formats):
        """Print a format/bitrate/extension table for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a Mixcloud cloudcast via the site's JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Walk the formats until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break  # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the .decode('utf-8') calls below are no-ops/errors on
        # py3 str — confirm which interpreter this file targets.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            # Conditional expression instead of the fragile `and/or` idiom.
            'format': (u'NA' if format_param is None else format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video page, a course page
        (recursing into its videos), or the root page (recursing into
        its courses).  Returns a list of info dictionaries.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'):  # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'):  # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the course's video-page links and recurse into each.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else:  # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course-page link and recurse into each.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape an MTV video page, then fetch its mediaGen XML to get
        the stream URL.  Returns a one-element list of info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme-less URLs are accepted by _VALID_URL; normalize them.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed garbled message: was 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (multi-segment FLV/MP4 streams)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random components,
        # mimicking the site's player.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)
        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        # Linear-congruential shuffle: repeatedly pick-and-remove one char.
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated numeric file id through the mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the file id, and emit one info
        dict per stream segment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            # xrange → range: the sibling helper above already uses range,
            # and xrange does not exist on Python 3.
            for i in range(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            # TODO check error
            # youku only could be viewed from mainland china
        except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the page source."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Consistency fix: wrap err in compat_str like the other extractors.
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""  # typo fix: was "extry"
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report entry upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3474 def _real_extract(self, url):
3475 # Extract id from URL
3476 mobj = re.match(self._VALID_URL, url)
3478 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3481 post_url = mobj.group(0)
3482 video_id = mobj.group(2)
3484 video_extension = 'flv'
3486 # Step 1, Retrieve post webpage to extract further information
3487 self.report_extract_entry(post_url)
3488 request = compat_urllib_request.Request(post_url)
3490 webpage = compat_urllib_request.urlopen(request).read()
3491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3492 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3495 # Extract update date
3497 pattern = 'title="Timestamp">(.*?)</a>'
3498 mobj = re.search(pattern, webpage)
3500 upload_date = mobj.group(1)
3501 # Convert timestring to a format suitable for filename
3502 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3503 upload_date = upload_date.strftime('%Y%m%d')
3504 self.report_date(upload_date)
3508 pattern = r'rel\="author".*?>(.*?)</a>'
3509 mobj = re.search(pattern, webpage)
3511 uploader = mobj.group(1)
3512 self.report_uploader(uploader)
3515 # Get the first line for title
3517 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3518 mobj = re.search(pattern, webpage)
3520 video_title = mobj.group(1)
3521 self.report_title(video_title)
3523 # Step 2, Stimulate clicking the image box to launch video
3524 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3525 mobj = re.search(pattern, webpage)
3527 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3529 video_page = mobj.group(1)
3530 request = compat_urllib_request.Request(video_page)
3532 webpage = compat_urllib_request.urlopen(request).read()
3533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3534 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3536 self.report_extract_vid_page(video_page)
3539 # Extract video links on video page
3540 """Extract video links of all sizes"""
3541 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3542 mobj = re.findall(pattern, webpage)
3544 self._downloader.trouble(u'ERROR: unable to extract video links')
3546 # Sort in resolution
3547 links = sorted(mobj)
3549 # Choose the lowest of the sort, i.e. highest resolution
3550 video_url = links[-1]
3551 # Only get the url. The resolution part in the tuple has no use anymore
3552 video_url = video_url[-1]
3553 # Treat escaped \u0026 style hex
3554 video_url = unicode(video_url, "unicode_escape")
3558 'id': video_id.decode('utf-8'),
3560 'uploader': uploader.decode('utf-8'),
3561 'upload_date': upload_date.decode('utf-8'),
3562 'title': video_title.decode('utf-8'),
3563 'ext': video_extension.decode('utf-8'),