git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import netrc
   6 import os
   7 import re
   8 import socket
   9 import time
  10 import email.utils
  11 import xml.etree.ElementTree
  12 import random
  13 import math
  14 from urlparse import parse_qs
  15
  16 from utils import *
  17
  18
  19 class InfoExtractor(object):
  20         """Information Extractor class.
  21
  22         Information extractors are the classes that, given a URL, extract
  23         information about the video (or videos) the URL refers to. This
  24         information includes the real video URL, the video title, author and
  25         others. The information is stored in a dictionary which is then
  26         passed to the FileDownloader. The FileDownloader processes this
  27         information possibly downloading the video to the file system, among
  28         other possible outcomes.
  29
  30         The dictionaries must include the following fields:
  31
  32         id:             Video identifier.
  33         url:            Final video URL.
  34         uploader:       Nickname of the video uploader, unescaped.
  35         upload_date:    Video upload date (YYYYMMDD).
  36         title:          Video title, unescaped.
  37         ext:            Video filename extension.
  38
  39         The following fields are optional:
  40
  41         format:         The video format, defaults to ext (used for --get-format)
  42         thumbnail:      Full URL to a video thumbnail image.
  43         description:    One-line video description.
  44         player_url:     SWF Player URL (used for rtmpdump).
  45         subtitles:      The .srt file contents.
  46         urlhandle:              [internal] The urlHandle to be used to download the file,
  47                         like returned by urllib.request.urlopen
  48
  49         The fields should all be Unicode strings.
  50
  51         Subclasses of this one should re-define the _real_initialize() and
  52         _real_extract() methods and define a _VALID_URL regexp.
  53         Probably, they should also be added to the list of extractors.
  54
  55         _real_extract() must return a *list* of information dictionaries as
  56         described above.
  57
  58         Finally, the _WORKING attribute should be set to False for broken IEs
  59         in order to warn the users and skip the tests.
  60         """
  61
  62         _ready = False
  63         _downloader = None
  64         _WORKING = True
  65
  66         def __init__(self, downloader=None):
  67                 """Constructor. Receives an optional downloader."""
  68                 self._ready = False
  69                 self.set_downloader(downloader)
  70
  71         def suitable(self, url):
  72                 """Receives a URL and returns True if suitable for this IE."""
  73                 return re.match(self._VALID_URL, url) is not None
  74
  75         def working(self):
  76                 """Getter method for _WORKING."""
  77                 return self._WORKING
  78
  79         def initialize(self):
  80                 """Initializes an instance (authentication, etc)."""
  81                 if not self._ready:
  82                         self._real_initialize()
  83                         self._ready = True
  84
  85         def extract(self, url):
  86                 """Extracts URL information and returns it in list of dicts."""
  87                 self.initialize()
  88                 return self._real_extract(url)
  89
  90         def set_downloader(self, downloader):
  91                 """Sets the downloader for this IE."""
  92                 self._downloader = downloader
  93
  94         def _real_initialize(self):
  95                 """Real initialization process. Redefine in subclasses."""
  96                 pass
  97
  98         def _real_extract(self, url):
  99                 """Real extraction process. Redefine in subclasses."""
 100                 pass
 101
 102
 103 class YoutubeIE(InfoExtractor):
 104         """Information extractor for youtube.com."""
 105
 106         _VALID_URL = r"""^
 107                          (
 108                              (?:https?://)?                                       # http(s):// (optional)
 109                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 110                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 111                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 112                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 113                              (?:                                                  # the various things that can precede the ID:
 114                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 115                                  |(?:                                             # or the v= param in all its forms
 116                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 117                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 118                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 119                                      v=
 120                                  )
 121                              )?                                                   # optional -> youtube.com/xxxx is OK
 122                          )?                                                       # all until now is optional -> you can pass the naked ID
 123                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 124                          (?(1).+)?                                                # if we found the ID, everything can follow
 125                          $"""
 126         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 127         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 128         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 129         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 130         _NETRC_MACHINE = 'youtube'
 131         # Listed in order of quality
 132         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 133         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 134         _video_extensions = {
 135                 '13': '3gp',
 136                 '17': 'mp4',
 137                 '18': 'mp4',
 138                 '22': 'mp4',
 139                 '37': 'mp4',
 140                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 141                 '43': 'webm',
 142                 '44': 'webm',
 143                 '45': 'webm',
 144                 '46': 'webm',
 145         }
 146         _video_dimensions = {
 147                 '5': '240x400',
 148                 '6': '???',
 149                 '13': '???',
 150                 '17': '144x176',
 151                 '18': '360x640',
 152                 '22': '720x1280',
 153                 '34': '360x640',
 154                 '35': '480x854',
 155                 '37': '1080x1920',
 156                 '38': '3072x4096',
 157                 '43': '360x640',
 158                 '44': '480x854',
 159                 '45': '720x1280',
 160                 '46': '1080x1920',
 161         }
 162         IE_NAME = u'youtube'
 163
 164         def suitable(self, url):
 165                 """Receives a URL and returns True if suitable for this IE."""
 166                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 167
 168         def report_lang(self):
 169                 """Report attempt to set language."""
 170                 self._downloader.to_screen(u'[youtube] Setting language')
 171
 172         def report_login(self):
 173                 """Report attempt to log in."""
 174                 self._downloader.to_screen(u'[youtube] Logging in')
 175
 176         def report_age_confirmation(self):
 177                 """Report attempt to confirm age."""
 178                 self._downloader.to_screen(u'[youtube] Confirming age')
 179
 180         def report_video_webpage_download(self, video_id):
 181                 """Report attempt to download video webpage."""
 182                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 183
 184         def report_video_info_webpage_download(self, video_id):
 185                 """Report attempt to download video info webpage."""
 186                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 187
 188         def report_video_subtitles_download(self, video_id):
 189                 """Report attempt to download video info webpage."""
 190                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 191
 192         def report_information_extraction(self, video_id):
 193                 """Report attempt to extract video information."""
 194                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 195
 196         def report_unavailable_format(self, video_id, format):
 197                 """Report extracted video URL."""
 198                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 199
 200         def report_rtmp_download(self):
 201                 """Indicate the download will use the RTMP protocol."""
 202                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 203
 204         def _closed_captions_xml_to_srt(self, xml_string):
 205                 srt = ''
 206                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 207                 # TODO parse xml instead of regex
 208                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 209                         if not dur: dur = '4'
 210                         start = float(start)
 211                         end = start + float(dur)
 212                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 213                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 214                         caption = unescapeHTML(caption)
 215                         caption = unescapeHTML(caption) # double cycle, intentional
 216                         srt += str(n+1) + '\n'
 217                         srt += start + ' --> ' + end + '\n'
 218                         srt += caption + '\n\n'
 219                 return srt
 220
 221         def _print_formats(self, formats):
 222                 print('Available formats:')
 223                 for x in formats:
 224                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 225
 226         def _real_initialize(self):
 227                 if self._downloader is None:
 228                         return
 229
 230                 username = None
 231                 password = None
 232                 downloader_params = self._downloader.params
 233
 234                 # Attempt to use provided username and password or .netrc data
 235                 if downloader_params.get('username', None) is not None:
 236                         username = downloader_params['username']
 237                         password = downloader_params['password']
 238                 elif downloader_params.get('usenetrc', False):
 239                         try:
 240                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 241                                 if info is not None:
 242                                         username = info[0]
 243                                         password = info[2]
 244                                 else:
 245                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 246                         except (IOError, netrc.NetrcParseError) as err:
 247                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 248                                 return
 249
 250                 # Set language
 251                 request = compat_urllib_request.Request(self._LANG_URL)
 252                 try:
 253                         self.report_lang()
 254                         compat_urllib_request.urlopen(request).read()
 255                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 256                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 257                         return
 258
 259                 # No authentication to be performed
 260                 if username is None:
 261                         return
 262
 263                 # Log in
 264                 login_form = {
 265                                 'current_form': 'loginForm',
 266                                 'next':         '/',
 267                                 'action_login': 'Log In',
 268                                 'username':     username,
 269                                 'password':     password,
 270                                 }
 271                 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 272                 try:
 273                         self.report_login()
 274                         login_results = compat_urllib_request.urlopen(request).read()
 275                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 276                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 277                                 return
 278                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 279                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 280                         return
 281
 282                 # Confirm age
 283                 age_form = {
 284                                 'next_url':             '/',
 285                                 'action_confirm':       'Confirm',
 286                                 }
 287                 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 288                 try:
 289                         self.report_age_confirmation()
 290                         age_results = compat_urllib_request.urlopen(request).read()
 291                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 292                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 293                         return
 294
 295         def _real_extract(self, url):
 296                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 297                 mobj = re.search(self._NEXT_URL_RE, url)
 298                 if mobj:
 299                         url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 300
 301                 # Extract video id from URL
 302                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 303                 if mobj is None:
 304                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 305                         return
 306                 video_id = mobj.group(2)
 307
 308                 # Get video webpage
 309                 self.report_video_webpage_download(video_id)
 310                 request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 311                 try:
 312                         video_webpage = compat_urllib_request.urlopen(request).read()
 313                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 314                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 315                         return
 316
 317                 # Attempt to extract SWF player URL
 318                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 319                 if mobj is not None:
 320                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 321                 else:
 322                         player_url = None
 323
 324                 # Get video info
 325                 self.report_video_info_webpage_download(video_id)
 326                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 327                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 328                                         % (video_id, el_type))
 329                         request = compat_urllib_request.Request(video_info_url)
 330                         try:
 331                                 video_info_webpage = compat_urllib_request.urlopen(request).read()
 332                                 video_info = parse_qs(video_info_webpage)
 333                                 if 'token' in video_info:
 334                                         break
 335                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 336                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 337                                 return
 338                 if 'token' not in video_info:
 339                         if 'reason' in video_info:
 340                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 341                         else:
 342                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 343                         return
 344
 345                 # Check for "rental" videos
 346                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 347                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 348                         return
 349
 350                 # Start extracting information
 351                 self.report_information_extraction(video_id)
 352
 353                 # uploader
 354                 if 'author' not in video_info:
 355                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 356                         return
 357                 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 358
 359                 # title
 360                 if 'title' not in video_info:
 361                         self._downloader.trouble(u'ERROR: unable to extract video title')
 362                         return
 363                 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 364                 video_title = video_title.decode('utf-8')
 365
 366                 # thumbnail image
 367                 if 'thumbnail_url' not in video_info:
 368                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 369                         video_thumbnail = ''
 370                 else:   # don't panic if we can't find it
 371                         video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 372
 373                 # upload date
 374                 upload_date = None
 375                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 376                 if mobj is not None:
 377                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 378                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 379                         for expression in format_expressions:
 380                                 try:
 381                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 382                                 except:
 383                                         pass
 384
 385                 # description
 386                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 387                 if video_description: video_description = clean_html(video_description)
 388                 else: video_description = ''
 389
 390                 # closed captions
 391                 video_subtitles = None
 392                 if self._downloader.params.get('writesubtitles', False):
 393                         try:
 394                                 self.report_video_subtitles_download(video_id)
 395                                 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 396                                 try:
 397                                         srt_list = compat_urllib_request.urlopen(request).read()
 398                                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 399                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 400                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 401                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 402                                 if not srt_lang_list:
 403                                         raise Trouble(u'WARNING: video has no closed captions')
 404                                 if self._downloader.params.get('subtitleslang', False):
 405                                         srt_lang = self._downloader.params.get('subtitleslang')
 406                                 elif 'en' in srt_lang_list:
 407                                         srt_lang = 'en'
 408                                 else:
 409                                         srt_lang = srt_lang_list.keys()[0]
 410                                 if not srt_lang in srt_lang_list:
 411                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 412                                 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 413                                 try:
 414                                         srt_xml = compat_urllib_request.urlopen(request).read()
 415                                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 416                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 417                                 if not srt_xml:
 418                                         raise Trouble(u'WARNING: unable to download video subtitles')
 419                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 420                         except Trouble as trouble:
 421                                 self._downloader.trouble(trouble[0])
 422
 423                 if 'length_seconds' not in video_info:
 424                         self._downloader.trouble(u'WARNING: unable to extract video duration')
 425                         video_duration = ''
 426                 else:
 427                         video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 428
 429                 # token
 430                 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 431
 432                 # Decide which formats to download
 433                 req_format = self._downloader.params.get('format', None)
 434
 435                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 436                         self.report_rtmp_download()
 437                         video_url_list = [(None, video_info['conn'][0])]
 438                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 439                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 440                         url_data = [parse_qs(uds) for uds in url_data_strs]
 441                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 442                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 443
 444                         format_limit = self._downloader.params.get('format_limit', None)
 445                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 446                         if format_limit is not None and format_limit in available_formats:
 447                                 format_list = available_formats[available_formats.index(format_limit):]
 448                         else:
 449                                 format_list = available_formats
 450                         existing_formats = [x for x in format_list if x in url_map]
 451                         if len(existing_formats) == 0:
 452                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 453                                 return
 454                         if self._downloader.params.get('listformats', None):
 455                                 self._print_formats(existing_formats)
 456                                 return
 457                         if req_format is None or req_format == 'best':
 458                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 459                         elif req_format == 'worst':
 460                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 461                         elif req_format in ('-1', 'all'):
 462                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 463                         else:
 464                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 465                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 466                                 req_formats = req_format.split('/')
 467                                 video_url_list = None
 468                                 for rf in req_formats:
 469                                         if rf in url_map:
 470                                                 video_url_list = [(rf, url_map[rf])]
 471                                                 break
 472                                 if video_url_list is None:
 473                                         self._downloader.trouble(u'ERROR: requested format not available')
 474                                         return
 475                 else:
 476                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 477                         return
 478
 479                 results = []
 480                 for format_param, video_real_url in video_url_list:
 481                         # Extension
 482                         video_extension = self._video_extensions.get(format_param, 'flv')
 483
 484                         video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
 485                                                             self._video_dimensions.get(format_param, '???'))
 486
 487                         results.append({
 488                                 'id':           video_id.decode('utf-8'),
 489                                 'url':          video_real_url.decode('utf-8'),
 490                                 'uploader':     video_uploader.decode('utf-8'),
 491                                 'upload_date':  upload_date,
 492                                 'title':        video_title,
 493                                 'ext':          video_extension.decode('utf-8'),
 494                                 'format':       video_format,
 495                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 496                                 'description':  video_description,
 497                                 'player_url':   player_url,
 498                                 'subtitles':    video_subtitles,
 499                                 'duration':             video_duration
 500                         })
 501                 return results
 502
 503
 504 class MetacafeIE(InfoExtractor):
 505         """Information Extractor for metacafe.com."""
 506
 507         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 508         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 509         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 510         IE_NAME = u'metacafe'
 511
 512         def __init__(self, downloader=None):
 513                 InfoExtractor.__init__(self, downloader)
 514
 515         def report_disclaimer(self):
 516                 """Report disclaimer retrieval."""
 517                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 518
 519         def report_age_confirmation(self):
 520                 """Report attempt to confirm age."""
 521                 self._downloader.to_screen(u'[metacafe] Confirming age')
 522
 523         def report_download_webpage(self, video_id):
 524                 """Report webpage download."""
 525                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 526
 527         def report_extraction(self, video_id):
 528                 """Report information extraction."""
 529                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 530
 531         def _real_initialize(self):
 532                 # Retrieve disclaimer
 533                 request = compat_urllib_request.Request(self._DISCLAIMER)
 534                 try:
 535                         self.report_disclaimer()
 536                         disclaimer = compat_urllib_request.urlopen(request).read()
 537                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 538                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 539                         return
 540
 541                 # Confirm age
 542                 disclaimer_form = {
 543                         'filters': '0',
 544                         'submit': "Continue - I'm over 18",
 545                         }
 546                 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 547                 try:
 548                         self.report_age_confirmation()
 549                         disclaimer = compat_urllib_request.urlopen(request).read()
 550                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 551                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 552                         return
 553
 554         def _real_extract(self, url):
 555                 # Extract id and simplified title from URL
 556                 mobj = re.match(self._VALID_URL, url)
 557                 if mobj is None:
 558                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 559                         return
 560
 561                 video_id = mobj.group(1)
 562
 563                 # Check if video comes from YouTube
 564                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 565                 if mobj2 is not None:
 566                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 567                         return
 568
 569                 # Retrieve video webpage to extract further information
 570                 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 571                 try:
 572                         self.report_download_webpage(video_id)
 573                         webpage = compat_urllib_request.urlopen(request).read()
 574                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 575                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 576                         return
 577
 578                 # Extract URL, uploader and title from webpage
 579                 self.report_extraction(video_id)
 580                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 581                 if mobj is not None:
 582                         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 583                         video_extension = mediaURL[-3:]
 584
 585                         # Extract gdaKey if available
 586                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 587                         if mobj is None:
 588                                 video_url = mediaURL
 589                         else:
 590                                 gdaKey = mobj.group(1)
 591                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 592                 else:
 593                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 594                         if mobj is None:
 595                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 596                                 return
 597                         vardict = parse_qs(mobj.group(1))
 598                         if 'mediaData' not in vardict:
 599                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 600                                 return
 601                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 602                         if mobj is None:
 603                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 604                                 return
 605                         mediaURL = mobj.group(1).replace('\\/', '/')
 606                         video_extension = mediaURL[-3:]
 607                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 608
 609                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 610                 if mobj is None:
 611                         self._downloader.trouble(u'ERROR: unable to extract title')
 612                         return
 613                 video_title = mobj.group(1).decode('utf-8')
 614
 615                 mobj = re.search(r'submitter=(.*?);', webpage)
 616                 if mobj is None:
 617                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 618                         return
 619                 video_uploader = mobj.group(1)
 620
 621                 return [{
 622                         'id':           video_id.decode('utf-8'),
 623                         'url':          video_url.decode('utf-8'),
 624                         'uploader':     video_uploader.decode('utf-8'),
 625                         'upload_date':  None,
 626                         'title':        video_title,
 627                         'ext':          video_extension.decode('utf-8'),
 628                 }]
 629
 630
 631 class DailymotionIE(InfoExtractor):
 632         """Information Extractor for Dailymotion"""
 633
 634         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 635         IE_NAME = u'dailymotion'
 636
 637         def __init__(self, downloader=None):
 638                 InfoExtractor.__init__(self, downloader)
 639
 640         def report_download_webpage(self, video_id):
 641                 """Report webpage download."""
 642                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 643
 644         def report_extraction(self, video_id):
 645                 """Report information extraction."""
 646                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 647
 648         def _real_extract(self, url):
 649                 # Extract id and simplified title from URL
 650                 mobj = re.match(self._VALID_URL, url)
 651                 if mobj is None:
 652                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 653                         return
 654
 655                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 656
 657                 video_extension = 'mp4'
 658
 659                 # Retrieve video webpage to extract further information
 660                 request = compat_urllib_request.Request(url)
 661                 request.add_header('Cookie', 'family_filter=off')
 662                 try:
 663                         self.report_download_webpage(video_id)
 664                         webpage = compat_urllib_request.urlopen(request).read()
 665                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 666                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 667                         return
 668
 669                 # Extract URL, uploader and title from webpage
 670                 self.report_extraction(video_id)
 671                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 672                 if mobj is None:
 673                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 674                         return
 675                 flashvars = compat_urllib_parse.unquote(mobj.group(1))
 676
 677                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 678                         if key in flashvars:
 679                                 max_quality = key
 680                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 681                                 break
 682                 else:
 683                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 684                         return
 685
 686                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 687                 if mobj is None:
 688                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 689                         return
 690
 691                 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 692
 693                 # TODO: support choosing qualities
 694
 695                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 696                 if mobj is None:
 697                         self._downloader.trouble(u'ERROR: unable to extract title')
 698                         return
 699                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 700
 701                 video_uploader = None
 702                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 703                 if mobj is None:
 704                         # lookin for official user
 705                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 706                         if mobj_official is None:
 707                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 708                         else:
 709                                 video_uploader = mobj_official.group(1)
 710                 else:
 711                         video_uploader = mobj.group(1)
 712
 713                 video_upload_date = None
 714                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 715                 if mobj is not None:
 716                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 717
 718                 return [{
 719                         'id':           video_id.decode('utf-8'),
 720                         'url':          video_url.decode('utf-8'),
 721                         'uploader':     video_uploader.decode('utf-8'),
 722                         'upload_date':  video_upload_date,
 723                         'title':        video_title,
 724                         'ext':          video_extension.decode('utf-8'),
 725                 }]
 726
 727
 728 class GoogleIE(InfoExtractor):
 729         """Information extractor for video.google.com."""
 730
 731         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 732         IE_NAME = u'video.google'
 733
 734         def __init__(self, downloader=None):
 735                 InfoExtractor.__init__(self, downloader)
 736
 737         def report_download_webpage(self, video_id):
 738                 """Report webpage download."""
 739                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 740
 741         def report_extraction(self, video_id):
 742                 """Report information extraction."""
 743                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 744
 745         def _real_extract(self, url):
 746                 # Extract id from URL
 747                 mobj = re.match(self._VALID_URL, url)
 748                 if mobj is None:
 749                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 750                         return
 751
 752                 video_id = mobj.group(1)
 753
 754                 video_extension = 'mp4'
 755
 756                 # Retrieve video webpage to extract further information
 757                 request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 758                 try:
 759                         self.report_download_webpage(video_id)
 760                         webpage = compat_urllib_request.urlopen(request).read()
 761                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 762                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 763                         return
 764
 765                 # Extract URL, uploader, and title from webpage
 766                 self.report_extraction(video_id)
 767                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 768                 if mobj is None:
 769                         video_extension = 'flv'
 770                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 771                 if mobj is None:
 772                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 773                         return
 774                 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 775                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 776                 mediaURL = mediaURL.replace('\\x26', '\x26')
 777
 778                 video_url = mediaURL
 779
 780                 mobj = re.search(r'<title>(.*)</title>', webpage)
 781                 if mobj is None:
 782                         self._downloader.trouble(u'ERROR: unable to extract title')
 783                         return
 784                 video_title = mobj.group(1).decode('utf-8')
 785
 786                 # Extract video description
 787                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 788                 if mobj is None:
 789                         self._downloader.trouble(u'ERROR: unable to extract video description')
 790                         return
 791                 video_description = mobj.group(1).decode('utf-8')
 792                 if not video_description:
 793                         video_description = 'No description available.'
 794
 795                 # Extract video thumbnail
 796                 if self._downloader.params.get('forcethumbnail', False):
 797                         request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 798                         try:
 799                                 webpage = compat_urllib_request.urlopen(request).read()
 800                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 801                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 802                                 return
 803                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 804                         if mobj is None:
 805                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 806                                 return
 807                         video_thumbnail = mobj.group(1)
 808                 else:   # we need something to pass to process_info
 809                         video_thumbnail = ''
 810
 811                 return [{
 812                         'id':           video_id.decode('utf-8'),
 813                         'url':          video_url.decode('utf-8'),
 814                         'uploader':     None,
 815                         'upload_date':  None,
 816                         'title':        video_title,
 817                         'ext':          video_extension.decode('utf-8'),
 818                 }]
 819
 820
 821 class PhotobucketIE(InfoExtractor):
 822         """Information extractor for photobucket.com."""
 823
 824         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 825         IE_NAME = u'photobucket'
 826
 827         def __init__(self, downloader=None):
 828                 InfoExtractor.__init__(self, downloader)
 829
 830         def report_download_webpage(self, video_id):
 831                 """Report webpage download."""
 832                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 833
 834         def report_extraction(self, video_id):
 835                 """Report information extraction."""
 836                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 837
 838         def _real_extract(self, url):
 839                 # Extract id from URL
 840                 mobj = re.match(self._VALID_URL, url)
 841                 if mobj is None:
 842                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 843                         return
 844
 845                 video_id = mobj.group(1)
 846
 847                 video_extension = 'flv'
 848
 849                 # Retrieve video webpage to extract further information
 850                 request = compat_urllib_request.Request(url)
 851                 try:
 852                         self.report_download_webpage(video_id)
 853                         webpage = compat_urllib_request.urlopen(request).read()
 854                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 855                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 856                         return
 857
 858                 # Extract URL, uploader, and title from webpage
 859                 self.report_extraction(video_id)
 860                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 861                 if mobj is None:
 862                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 863                         return
 864                 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 865
 866                 video_url = mediaURL
 867
 868                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 869                 if mobj is None:
 870                         self._downloader.trouble(u'ERROR: unable to extract title')
 871                         return
 872                 video_title = mobj.group(1).decode('utf-8')
 873
 874                 video_uploader = mobj.group(2).decode('utf-8')
 875
 876                 return [{
 877                         'id':           video_id.decode('utf-8'),
 878                         'url':          video_url.decode('utf-8'),
 879                         'uploader':     video_uploader,
 880                         'upload_date':  None,
 881                         'title':        video_title,
 882                         'ext':          video_extension.decode('utf-8'),
 883                 }]
 884
 885
 886 class YahooIE(InfoExtractor):
 887         """Information extractor for video.yahoo.com."""
 888
 889         # _VALID_URL matches all Yahoo! Video URLs
 890         # _VPAGE_URL matches only the extractable '/watch/' URLs
 891         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 892         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 893         IE_NAME = u'video.yahoo'
 894
 895         def __init__(self, downloader=None):
 896                 InfoExtractor.__init__(self, downloader)
 897
 898         def report_download_webpage(self, video_id):
 899                 """Report webpage download."""
 900                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 901
 902         def report_extraction(self, video_id):
 903                 """Report information extraction."""
 904                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 905
 906         def _real_extract(self, url, new_video=True):
 907                 # Extract ID from URL
 908                 mobj = re.match(self._VALID_URL, url)
 909                 if mobj is None:
 910                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 911                         return
 912
 913                 video_id = mobj.group(2)
 914                 video_extension = 'flv'
 915
 916                 # Rewrite valid but non-extractable URLs as
 917                 # extractable English language /watch/ URLs
 918                 if re.match(self._VPAGE_URL, url) is None:
 919                         request = compat_urllib_request.Request(url)
 920                         try:
 921                                 webpage = compat_urllib_request.urlopen(request).read()
 922                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 923                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 924                                 return
 925
 926                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 927                         if mobj is None:
 928                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 929                                 return
 930                         yahoo_id = mobj.group(1)
 931
 932                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 933                         if mobj is None:
 934                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 935                                 return
 936                         yahoo_vid = mobj.group(1)
 937
 938                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 939                         return self._real_extract(url, new_video=False)
 940
 941                 # Retrieve video webpage to extract further information
 942                 request = compat_urllib_request.Request(url)
 943                 try:
 944                         self.report_download_webpage(video_id)
 945                         webpage = compat_urllib_request.urlopen(request).read()
 946                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 947                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 948                         return
 949
 950                 # Extract uploader and title from webpage
 951                 self.report_extraction(video_id)
 952                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 953                 if mobj is None:
 954                         self._downloader.trouble(u'ERROR: unable to extract video title')
 955                         return
 956                 video_title = mobj.group(1).decode('utf-8')
 957
 958                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 959                 if mobj is None:
 960                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 961                         return
 962                 video_uploader = mobj.group(1).decode('utf-8')
 963
 964                 # Extract video thumbnail
 965                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 966                 if mobj is None:
 967                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 968                         return
 969                 video_thumbnail = mobj.group(1).decode('utf-8')
 970
 971                 # Extract video description
 972                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 973                 if mobj is None:
 974                         self._downloader.trouble(u'ERROR: unable to extract video description')
 975                         return
 976                 video_description = mobj.group(1).decode('utf-8')
 977                 if not video_description:
 978                         video_description = 'No description available.'
 979
 980                 # Extract video height and width
 981                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 982                 if mobj is None:
 983                         self._downloader.trouble(u'ERROR: unable to extract video height')
 984                         return
 985                 yv_video_height = mobj.group(1)
 986
 987                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 988                 if mobj is None:
 989                         self._downloader.trouble(u'ERROR: unable to extract video width')
 990                         return
 991                 yv_video_width = mobj.group(1)
 992
 993                 # Retrieve video playlist to extract media URL
 994                 # I'm not completely sure what all these options are, but we
 995                 # seem to need most of them, otherwise the server sends a 401.
 996                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 997                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 998                 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 999                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1000                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1001                 try:
1002                         self.report_download_webpage(video_id)
1003                         webpage = compat_urllib_request.urlopen(request).read()
1004                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1005                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1006                         return
1007
1008                 # Extract media URL from playlist XML
1009                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1010                 if mobj is None:
1011                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1012                         return
1013                 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1014                 video_url = unescapeHTML(video_url)
1015
1016                 return [{
1017                         'id':           video_id.decode('utf-8'),
1018                         'url':          video_url,
1019                         'uploader':     video_uploader,
1020                         'upload_date':  None,
1021                         'title':        video_title,
1022                         'ext':          video_extension.decode('utf-8'),
1023                         'thumbnail':    video_thumbnail.decode('utf-8'),
1024                         'description':  video_description,
1025                 }]
1026
1027
1028 class VimeoIE(InfoExtractor):
1029         """Information extractor for vimeo.com."""
1030
1031         # _VALID_URL matches Vimeo URLs
1032         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1033         IE_NAME = u'vimeo'
1034
1035         def __init__(self, downloader=None):
1036                 InfoExtractor.__init__(self, downloader)
1037
1038         def report_download_webpage(self, video_id):
1039                 """Report webpage download."""
1040                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1041
1042         def report_extraction(self, video_id):
1043                 """Report information extraction."""
1044                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1045
1046         def _real_extract(self, url, new_video=True):
1047                 # Extract ID from URL
1048                 mobj = re.match(self._VALID_URL, url)
1049                 if mobj is None:
1050                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1051                         return
1052
1053                 video_id = mobj.group(1)
1054
1055                 # Retrieve video webpage to extract further information
1056                 request = compat_urllib_request.Request(url, None, std_headers)
1057                 try:
1058                         self.report_download_webpage(video_id)
1059                         webpage = compat_urllib_request.urlopen(request).read()
1060                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1061                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1062                         return
1063
1064                 # Now we begin extracting as much information as we can from what we
1065                 # retrieved. First we extract the information common to all extractors,
1066                 # and latter we extract those that are Vimeo specific.
1067                 self.report_extraction(video_id)
1068
1069                 # Extract the config JSON
1070                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1071                 try:
1072                         config = json.loads(config)
1073                 except:
1074                         self._downloader.trouble(u'ERROR: unable to extract info section')
1075                         return
1076
1077                 # Extract title
1078                 video_title = config["video"]["title"]
1079
1080                 # Extract uploader
1081                 video_uploader = config["video"]["owner"]["name"]
1082
1083                 # Extract video thumbnail
1084                 video_thumbnail = config["video"]["thumbnail"]
1085
1086                 # Extract video description
1087                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1088                 if video_description: video_description = clean_html(video_description)
1089                 else: video_description = ''
1090
1091                 # Extract upload date
1092                 video_upload_date = None
1093                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1094                 if mobj is not None:
1095                         video_upload_date = mobj.group(1)
1096
1097                 # Vimeo specific: extract request signature and timestamp
1098                 sig = config['request']['signature']
1099                 timestamp = config['request']['timestamp']
1100
1101                 # Vimeo specific: extract video codec and quality information
1102                 # First consider quality, then codecs, then take everything
1103                 # TODO bind to format param
1104                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1105                 files = { 'hd': [], 'sd': [], 'other': []}
1106                 for codec_name, codec_extension in codecs:
1107                         if codec_name in config["video"]["files"]:
1108                                 if 'hd' in config["video"]["files"][codec_name]:
1109                                         files['hd'].append((codec_name, codec_extension, 'hd'))
1110                                 elif 'sd' in config["video"]["files"][codec_name]:
1111                                         files['sd'].append((codec_name, codec_extension, 'sd'))
1112                                 else:
1113                                         files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1114
1115                 for quality in ('hd', 'sd', 'other'):
1116                         if len(files[quality]) > 0:
1117                                 video_quality = files[quality][0][2]
1118                                 video_codec = files[quality][0][0]
1119                                 video_extension = files[quality][0][1]
1120                                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1121                                 break
1122                 else:
1123                         self._downloader.trouble(u'ERROR: no known codec found')
1124                         return
1125
1126                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1127                                         %(video_id, sig, timestamp, video_quality, video_codec.upper())
1128
1129                 return [{
1130                         'id':           video_id,
1131                         'url':          video_url,
1132                         'uploader':     video_uploader,
1133                         'upload_date':  video_upload_date,
1134                         'title':        video_title,
1135                         'ext':          video_extension,
1136                         'thumbnail':    video_thumbnail,
1137                         'description':  video_description,
1138                 }]
1139
1140
1141 class ArteTvIE(InfoExtractor):
1142         """arte.tv information extractor."""
1143
1144         _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1145         _LIVE_URL = r'index-[0-9]+\.html$'
1146
1147         IE_NAME = u'arte.tv'
1148
1149         def __init__(self, downloader=None):
1150                 InfoExtractor.__init__(self, downloader)
1151
1152         def report_download_webpage(self, video_id):
1153                 """Report webpage download."""
1154                 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1155
1156         def report_extraction(self, video_id):
1157                 """Report information extraction."""
1158                 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1159
1160         def fetch_webpage(self, url):
1161                 self._downloader.increment_downloads()
1162                 request = compat_urllib_request.Request(url)
1163                 try:
1164                         self.report_download_webpage(url)
1165                         webpage = compat_urllib_request.urlopen(request).read()
1166                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1167                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1168                         return
1169                 except ValueError as err:
1170                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1171                         return
1172                 return webpage
1173
1174         def grep_webpage(self, url, regex, regexFlags, matchTuples):
1175                 page = self.fetch_webpage(url)
1176                 mobj = re.search(regex, page, regexFlags)
1177                 info = {}
1178
1179                 if mobj is None:
1180                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181                         return
1182
1183                 for (i, key, err) in matchTuples:
1184                         if mobj.group(i) is None:
1185                                 self._downloader.trouble(err)
1186                                 return
1187                         else:
1188                                 info[key] = mobj.group(i)
1189
1190                 return info
1191
1192         def extractLiveStream(self, url):
1193                 video_lang = url.split('/')[-4]
1194                 info = self.grep_webpage(
1195                         url,
1196                         r'src="(.*?/videothek_js.*?\.js)',
1197                         0,
1198                         [
1199                                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1200                         ]
1201                 )
1202                 http_host = url.split('/')[2]
1203                 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1204                 info = self.grep_webpage(
1205                         next_url,
1206                         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1207                                 '(http://.*?\.swf).*?' +
1208                                 '(rtmp://.*?)\'',
1209                         re.DOTALL,
1210                         [
1211                                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1212                                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1213                                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1214                         ]
1215                 )
1216                 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1217
1218         def extractPlus7Stream(self, url):
1219                 video_lang = url.split('/')[-3]
1220                 info = self.grep_webpage(
1221                         url,
1222                         r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1223                         0,
1224                         [
1225                                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1226                         ]
1227                 )
1228                 next_url = compat_urllib_parse.unquote(info.get('url'))
1229                 info = self.grep_webpage(
1230                         next_url,
1231                         r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1232                         0,
1233                         [
1234                                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1235                         ]
1236                 )
1237                 next_url = compat_urllib_parse.unquote(info.get('url'))
1238
1239                 info = self.grep_webpage(
1240                         next_url,
1241                         r'<video id="(.*?)".*?>.*?' +
1242                                 '<name>(.*?)</name>.*?' +
1243                                 '<dateVideo>(.*?)</dateVideo>.*?' +
1244                                 '<url quality="hd">(.*?)</url>',
1245                         re.DOTALL,
1246                         [
1247                                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1248                                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1249                                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1250                                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1251                         ]
1252                 )
1253
1254                 return {
1255                         'id':           info.get('id'),
1256                         'url':          compat_urllib_parse.unquote(info.get('url')),
1257                         'uploader':     u'arte.tv',
1258                         'upload_date':  info.get('date'),
1259                         'title':        info.get('title'),
1260                         'ext':          u'mp4',
1261                         'format':       u'NA',
1262                         'player_url':   None,
1263                 }
1264
1265         def _real_extract(self, url):
1266                 video_id = url.split('/')[-1]
1267                 self.report_extraction(video_id)
1268
1269                 if re.search(self._LIVE_URL, video_id) is not None:
1270                         self.extractLiveStream(url)
1271                         return
1272                 else:
1273                         info = self.extractPlus7Stream(url)
1274
1275                 return [info]
1276
1277
1278 class GenericIE(InfoExtractor):
1279         """Generic last-resort information extractor."""
1280
1281         _VALID_URL = r'.*'
1282         IE_NAME = u'generic'
1283
1284         def __init__(self, downloader=None):
1285                 InfoExtractor.__init__(self, downloader)
1286
1287         def report_download_webpage(self, video_id):
1288                 """Report webpage download."""
1289                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1290                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1291
1292         def report_extraction(self, video_id):
1293                 """Report information extraction."""
1294                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1295
1296         def report_following_redirect(self, new_url):
1297                 """Report information extraction."""
1298                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1299
1300         def _test_redirect(self, url):
1301                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1302                 class HeadRequest(compat_urllib_request.Request):
1303                         def get_method(self):
1304                                 return "HEAD"
1305
1306                 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1307                         """
1308                         Subclass the HTTPRedirectHandler to make it use our
1309                         HeadRequest also on the redirected URL
1310                         """
1311                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1312                                 if code in (301, 302, 303, 307):
1313                                         newurl = newurl.replace(' ', '%20')
1314                                         newheaders = dict((k,v) for k,v in req.headers.items()
1315                                                                           if k.lower() not in ("content-length", "content-type"))
1316                                         return HeadRequest(newurl,
1317                                                                            headers=newheaders,
1318                                                                            origin_req_host=req.get_origin_req_host(),
1319                                                                            unverifiable=True)
1320                                 else:
1321                                         raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1322
1323                 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1324                         """
1325                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1326                         """
1327                         def http_error_405(self, req, fp, code, msg, headers):
1328                                 fp.read()
1329                                 fp.close()
1330
1331                                 newheaders = dict((k,v) for k,v in req.headers.items()
1332                                                                   if k.lower() not in ("content-length", "content-type"))
1333                                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1334                                                                                                  headers=newheaders,
1335                                                                                                  origin_req_host=req.get_origin_req_host(),
1336                                                                                                  unverifiable=True))
1337
1338                 # Build our opener
1339                 opener = compat_urllib_request.OpenerDirector()
1340                 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1341                                                 HTTPMethodFallback, HEADRedirectHandler,
1342                                                 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1343                         opener.add_handler(handler())
1344
1345                 response = opener.open(HeadRequest(url))
1346                 new_url = response.geturl()
1347
1348                 if url == new_url:
1349                         return False
1350
1351                 self.report_following_redirect(new_url)
1352                 self._downloader.download([new_url])
1353                 return True
1354
1355         def _real_extract(self, url):
1356                 if self._test_redirect(url): return
1357
1358                 video_id = url.split('/')[-1]
1359                 request = compat_urllib_request.Request(url)
1360                 try:
1361                         self.report_download_webpage(video_id)
1362                         webpage = compat_urllib_request.urlopen(request).read()
1363                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1364                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1365                         return
1366                 except ValueError as err:
1367                         # since this is the last-resort InfoExtractor, if
1368                         # this error is thrown, it'll be thrown here
1369                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1370                         return
1371
1372                 self.report_extraction(video_id)
1373                 # Start with something easy: JW Player in SWFObject
1374                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1375                 if mobj is None:
1376                         # Broaden the search a little bit
1377                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1378                 if mobj is None:
1379                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380                         return
1381
1382                 # It's possible that one of the regexes
1383                 # matched, but returned an empty group:
1384                 if mobj.group(1) is None:
1385                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1386                         return
1387
1388                 video_url = compat_urllib_parse.unquote(mobj.group(1))
1389                 video_id = os.path.basename(video_url)
1390
1391                 # here's a fun little line of code for you:
1392                 video_extension = os.path.splitext(video_id)[1][1:]
1393                 video_id = os.path.splitext(video_id)[0]
1394
1395                 # it's tempting to parse this further, but you would
1396                 # have to take into account all the variations like
1397                 #   Video Title - Site Name
1398                 #   Site Name | Video Title
1399                 #   Video Title - Tagline | Site Name
1400                 # and so on and so forth; it's just not practical
1401                 mobj = re.search(r'<title>(.*)</title>', webpage)
1402                 if mobj is None:
1403                         self._downloader.trouble(u'ERROR: unable to extract title')
1404                         return
1405                 video_title = mobj.group(1).decode('utf-8')
1406
1407                 # video uploader is domain name
1408                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1409                 if mobj is None:
1410                         self._downloader.trouble(u'ERROR: unable to extract title')
1411                         return
1412                 video_uploader = mobj.group(1).decode('utf-8')
1413
1414                 return [{
1415                         'id':           video_id.decode('utf-8'),
1416                         'url':          video_url.decode('utf-8'),
1417                         'uploader':     video_uploader,
1418                         'upload_date':  None,
1419                         'title':        video_title,
1420                         'ext':          video_extension.decode('utf-8'),
1421                 }]
1422
1423
1424 class YoutubeSearchIE(InfoExtractor):
1425         """Information Extractor for YouTube search queries."""
1426         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1427         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1428         _max_youtube_results = 1000
1429         IE_NAME = u'youtube:search'
1430
1431         def __init__(self, downloader=None):
1432                 InfoExtractor.__init__(self, downloader)
1433
1434         def report_download_page(self, query, pagenum):
1435                 """Report attempt to download search page with given number."""
1436                 query = query.decode(preferredencoding())
1437                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1438
1439         def _real_extract(self, query):
1440                 mobj = re.match(self._VALID_URL, query)
1441                 if mobj is None:
1442                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1443                         return
1444
1445                 prefix, query = query.split(':')
1446                 prefix = prefix[8:]
1447                 query = query.encode('utf-8')
1448                 if prefix == '':
1449                         self._download_n_results(query, 1)
1450                         return
1451                 elif prefix == 'all':
1452                         self._download_n_results(query, self._max_youtube_results)
1453                         return
1454                 else:
1455                         try:
1456                                 n = int(prefix)
1457                                 if n <= 0:
1458                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1459                                         return
1460                                 elif n > self._max_youtube_results:
1461                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1462                                         n = self._max_youtube_results
1463                                 self._download_n_results(query, n)
1464                                 return
1465                         except ValueError: # parsing prefix as integer fails
1466                                 self._download_n_results(query, 1)
1467                                 return
1468
1469         def _download_n_results(self, query, n):
1470                 """Downloads a specified number of results for a query"""
1471
1472                 video_ids = []
1473                 pagenum = 0
1474                 limit = n
1475
1476                 while (50 * pagenum) < limit:
1477                         self.report_download_page(query, pagenum+1)
1478                         result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1479                         request = compat_urllib_request.Request(result_url)
1480                         try:
1481                                 data = compat_urllib_request.urlopen(request).read()
1482                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1483                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1484                                 return
1485                         api_response = json.loads(data)['data']
1486
1487                         new_ids = list(video['id'] for video in api_response['items'])
1488                         video_ids += new_ids
1489
1490                         limit = min(n, api_response['totalItems'])
1491                         pagenum += 1
1492
1493                 if len(video_ids) > n:
1494                         video_ids = video_ids[:n]
1495                 for id in video_ids:
1496                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1497                 return
1498
1499
1500 class GoogleSearchIE(InfoExtractor):
1501         """Information Extractor for Google Video search queries."""
1502         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1503         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1504         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1505         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1506         _max_google_results = 1000
1507         IE_NAME = u'video.google:search'
1508
1509         def __init__(self, downloader=None):
1510                 InfoExtractor.__init__(self, downloader)
1511
1512         def report_download_page(self, query, pagenum):
1513                 """Report attempt to download playlist page with given number."""
1514                 query = query.decode(preferredencoding())
1515                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1516
1517         def _real_extract(self, query):
1518                 mobj = re.match(self._VALID_URL, query)
1519                 if mobj is None:
1520                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1521                         return
1522
1523                 prefix, query = query.split(':')
1524                 prefix = prefix[8:]
1525                 query = query.encode('utf-8')
1526                 if prefix == '':
1527                         self._download_n_results(query, 1)
1528                         return
1529                 elif prefix == 'all':
1530                         self._download_n_results(query, self._max_google_results)
1531                         return
1532                 else:
1533                         try:
1534                                 n = int(prefix)
1535                                 if n <= 0:
1536                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1537                                         return
1538                                 elif n > self._max_google_results:
1539                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1540                                         n = self._max_google_results
1541                                 self._download_n_results(query, n)
1542                                 return
1543                         except ValueError: # parsing prefix as integer fails
1544                                 self._download_n_results(query, 1)
1545                                 return
1546
1547         def _download_n_results(self, query, n):
1548                 """Downloads a specified number of results for a query"""
1549
1550                 video_ids = []
1551                 pagenum = 0
1552
1553                 while True:
1554                         self.report_download_page(query, pagenum)
1555                         result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1556                         request = compat_urllib_request.Request(result_url)
1557                         try:
1558                                 page = compat_urllib_request.urlopen(request).read()
1559                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1560                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1561                                 return
1562
1563                         # Extract video identifiers
1564                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1565                                 video_id = mobj.group(1)
1566                                 if video_id not in video_ids:
1567                                         video_ids.append(video_id)
1568                                         if len(video_ids) == n:
1569                                                 # Specified n videos reached
1570                                                 for id in video_ids:
1571                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1572                                                 return
1573
1574                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1575                                 for id in video_ids:
1576                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1577                                 return
1578
1579                         pagenum = pagenum + 1
1580
1581
1582 class YahooSearchIE(InfoExtractor):
1583         """Information Extractor for Yahoo! Video search queries."""
1584         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1585         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1586         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1587         _MORE_PAGES_INDICATOR = r'\s*Next'
1588         _max_yahoo_results = 1000
1589         IE_NAME = u'video.yahoo:search'
1590
1591         def __init__(self, downloader=None):
1592                 InfoExtractor.__init__(self, downloader)
1593
1594         def report_download_page(self, query, pagenum):
1595                 """Report attempt to download playlist page with given number."""
1596                 query = query.decode(preferredencoding())
1597                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1598
1599         def _real_extract(self, query):
1600                 mobj = re.match(self._VALID_URL, query)
1601                 if mobj is None:
1602                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1603                         return
1604
1605                 prefix, query = query.split(':')
1606                 prefix = prefix[8:]
1607                 query = query.encode('utf-8')
1608                 if prefix == '':
1609                         self._download_n_results(query, 1)
1610                         return
1611                 elif prefix == 'all':
1612                         self._download_n_results(query, self._max_yahoo_results)
1613                         return
1614                 else:
1615                         try:
1616                                 n = int(prefix)
1617                                 if n <= 0:
1618                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1619                                         return
1620                                 elif n > self._max_yahoo_results:
1621                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1622                                         n = self._max_yahoo_results
1623                                 self._download_n_results(query, n)
1624                                 return
1625                         except ValueError: # parsing prefix as integer fails
1626                                 self._download_n_results(query, 1)
1627                                 return
1628
1629         def _download_n_results(self, query, n):
1630                 """Downloads a specified number of results for a query"""
1631
1632                 video_ids = []
1633                 already_seen = set()
1634                 pagenum = 1
1635
1636                 while True:
1637                         self.report_download_page(query, pagenum)
1638                         result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1639                         request = compat_urllib_request.Request(result_url)
1640                         try:
1641                                 page = compat_urllib_request.urlopen(request).read()
1642                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1643                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1644                                 return
1645
1646                         # Extract video identifiers
1647                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1648                                 video_id = mobj.group(1)
1649                                 if video_id not in already_seen:
1650                                         video_ids.append(video_id)
1651                                         already_seen.add(video_id)
1652                                         if len(video_ids) == n:
1653                                                 # Specified n videos reached
1654                                                 for id in video_ids:
1655                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1656                                                 return
1657
1658                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1659                                 for id in video_ids:
1660                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1661                                 return
1662
1663                         pagenum = pagenum + 1
1664
1665
1666 class YoutubePlaylistIE(InfoExtractor):
1667         """Information Extractor for YouTube playlists."""
1668
1669         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1670         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1671         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1672         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1673         IE_NAME = u'youtube:playlist'
1674
1675         def __init__(self, downloader=None):
1676                 InfoExtractor.__init__(self, downloader)
1677
1678         def report_download_page(self, playlist_id, pagenum):
1679                 """Report attempt to download playlist page with given number."""
1680                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1681
1682         def _real_extract(self, url):
1683                 # Extract playlist id
1684                 mobj = re.match(self._VALID_URL, url)
1685                 if mobj is None:
1686                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1687                         return
1688
1689                 # Single video case
1690                 if mobj.group(3) is not None:
1691                         self._downloader.download([mobj.group(3)])
1692                         return
1693
1694                 # Download playlist pages
1695                 # prefix is 'p' as default for playlists but there are other types that need extra care
1696                 playlist_prefix = mobj.group(1)
1697                 if playlist_prefix == 'a':
1698                         playlist_access = 'artist'
1699                 else:
1700                         playlist_prefix = 'p'
1701                         playlist_access = 'view_play_list'
1702                 playlist_id = mobj.group(2)
1703                 video_ids = []
1704                 pagenum = 1
1705
1706                 while True:
1707                         self.report_download_page(playlist_id, pagenum)
1708                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1709                         request = compat_urllib_request.Request(url)
1710                         try:
1711                                 page = compat_urllib_request.urlopen(request).read()
1712                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1713                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1714                                 return
1715
1716                         # Extract video identifiers
1717                         ids_in_page = []
1718                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1719                                 if mobj.group(1) not in ids_in_page:
1720                                         ids_in_page.append(mobj.group(1))
1721                         video_ids.extend(ids_in_page)
1722
1723                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1724                                 break
1725                         pagenum = pagenum + 1
1726
1727                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1728                 playlistend = self._downloader.params.get('playlistend', -1)
1729                 if playlistend == -1:
1730                         video_ids = video_ids[playliststart:]
1731                 else:
1732                         video_ids = video_ids[playliststart:playlistend]
1733
1734                 for id in video_ids:
1735                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1736                 return
1737
1738
1739 class YoutubeChannelIE(InfoExtractor):
1740         """Information Extractor for YouTube channels."""
1741
1742         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1743         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1744         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1745         IE_NAME = u'youtube:channel'
1746
1747         def report_download_page(self, channel_id, pagenum):
1748                 """Report attempt to download channel page with given number."""
1749                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1750
1751         def _real_extract(self, url):
1752                 # Extract channel id
1753                 mobj = re.match(self._VALID_URL, url)
1754                 if mobj is None:
1755                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1756                         return
1757
1758                 # Download channel pages
1759                 channel_id = mobj.group(1)
1760                 video_ids = []
1761                 pagenum = 1
1762
1763                 while True:
1764                         self.report_download_page(channel_id, pagenum)
1765                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1766                         request = compat_urllib_request.Request(url)
1767                         try:
1768                                 page = compat_urllib_request.urlopen(request).read()
1769                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1770                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1771                                 return
1772
1773                         # Extract video identifiers
1774                         ids_in_page = []
1775                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1776                                 if mobj.group(1) not in ids_in_page:
1777                                         ids_in_page.append(mobj.group(1))
1778                         video_ids.extend(ids_in_page)
1779
1780                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1781                                 break
1782                         pagenum = pagenum + 1
1783
1784                 for id in video_ids:
1785                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1786                 return
1787
1788
1789 class YoutubeUserIE(InfoExtractor):
1790         """Information Extractor for YouTube users."""
1791
1792         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1793         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1794         _GDATA_PAGE_SIZE = 50
1795         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1796         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1797         IE_NAME = u'youtube:user'
1798
1799         def __init__(self, downloader=None):
1800                 InfoExtractor.__init__(self, downloader)
1801
1802         def report_download_page(self, username, start_index):
1803                 """Report attempt to download user page."""
1804                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1805                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1806
1807         def _real_extract(self, url):
1808                 # Extract username
1809                 mobj = re.match(self._VALID_URL, url)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1812                         return
1813
1814                 username = mobj.group(1)
1815
1816                 # Download video ids using YouTube Data API. Result size per
1817                 # query is limited (currently to 50 videos) so we need to query
1818                 # page by page until there are no video ids - it means we got
1819                 # all of them.
1820
1821                 video_ids = []
1822                 pagenum = 0
1823
1824                 while True:
1825                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1826                         self.report_download_page(username, start_index)
1827
1828                         request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1829
1830                         try:
1831                                 page = compat_urllib_request.urlopen(request).read()
1832                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1833                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1834                                 return
1835
1836                         # Extract video identifiers
1837                         ids_in_page = []
1838
1839                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1840                                 if mobj.group(1) not in ids_in_page:
1841                                         ids_in_page.append(mobj.group(1))
1842
1843                         video_ids.extend(ids_in_page)
1844
1845                         # A little optimization - if current page is not
1846                         # "full", ie. does not contain PAGE_SIZE video ids then
1847                         # we can assume that this page is the last one - there
1848                         # are no more ids on further pages - no need to query
1849                         # again.
1850
1851                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1852                                 break
1853
1854                         pagenum += 1
1855
1856                 all_ids_count = len(video_ids)
1857                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1858                 playlistend = self._downloader.params.get('playlistend', -1)
1859
1860                 if playlistend == -1:
1861                         video_ids = video_ids[playliststart:]
1862                 else:
1863                         video_ids = video_ids[playliststart:playlistend]
1864
1865                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1866                                 (username, all_ids_count, len(video_ids)))
1867
1868                 for video_id in video_ids:
1869                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1870
1871
1872 class BlipTVUserIE(InfoExtractor):
1873         """Information Extractor for blip.tv users."""
1874
1875         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1876         _PAGE_SIZE = 12
1877         IE_NAME = u'blip.tv:user'
1878
1879         def __init__(self, downloader=None):
1880                 InfoExtractor.__init__(self, downloader)
1881
1882         def report_download_page(self, username, pagenum):
1883                 """Report attempt to download user page."""
1884                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1885                                 (self.IE_NAME, username, pagenum))
1886
1887         def _real_extract(self, url):
1888                 # Extract username
1889                 mobj = re.match(self._VALID_URL, url)
1890                 if mobj is None:
1891                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1892                         return
1893
1894                 username = mobj.group(1)
1895
1896                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1897
1898                 request = compat_urllib_request.Request(url)
1899
1900                 try:
1901                         page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1902                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1903                         page_base = page_base % mobj.group(1)
1904                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1905                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1906                         return
1907
1908
1909                 # Download video ids using BlipTV Ajax calls. Result size per
1910                 # query is limited (currently to 12 videos) so we need to query
1911                 # page by page until there are no video ids - it means we got
1912                 # all of them.
1913
1914                 video_ids = []
1915                 pagenum = 1
1916
1917                 while True:
1918                         self.report_download_page(username, pagenum)
1919
1920                         request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1921
1922                         try:
1923                                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1924                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1925                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1926                                 return
1927
1928                         # Extract video identifiers
1929                         ids_in_page = []
1930
1931                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1932                                 if mobj.group(1) not in ids_in_page:
1933                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1934
1935                         video_ids.extend(ids_in_page)
1936
1937                         # A little optimization - if current page is not
1938                         # "full", ie. does not contain PAGE_SIZE video ids then
1939                         # we can assume that this page is the last one - there
1940                         # are no more ids on further pages - no need to query
1941                         # again.
1942
1943                         if len(ids_in_page) < self._PAGE_SIZE:
1944                                 break
1945
1946                         pagenum += 1
1947
1948                 all_ids_count = len(video_ids)
1949                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1950                 playlistend = self._downloader.params.get('playlistend', -1)
1951
1952                 if playlistend == -1:
1953                         video_ids = video_ids[playliststart:]
1954                 else:
1955                         video_ids = video_ids[playliststart:playlistend]
1956
1957                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1958                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1959
1960                 for video_id in video_ids:
1961                         self._downloader.download([u'http://blip.tv/'+video_id])
1962
1963
1964 class DepositFilesIE(InfoExtractor):
1965         """Information extractor for depositfiles.com"""
1966
1967         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1968         IE_NAME = u'DepositFiles'
1969
1970         def __init__(self, downloader=None):
1971                 InfoExtractor.__init__(self, downloader)
1972
1973         def report_download_webpage(self, file_id):
1974                 """Report webpage download."""
1975                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1976
1977         def report_extraction(self, file_id):
1978                 """Report information extraction."""
1979                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1980
1981         def _real_extract(self, url):
1982                 file_id = url.split('/')[-1]
1983                 # Rebuild url in english locale
1984                 url = 'http://depositfiles.com/en/files/' + file_id
1985
1986                 # Retrieve file webpage with 'Free download' button pressed
1987                 free_download_indication = { 'gateway_result' : '1' }
1988                 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1989                 try:
1990                         self.report_download_webpage(file_id)
1991                         webpage = compat_urllib_request.urlopen(request).read()
1992                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1993                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1994                         return
1995
1996                 # Search for the real file URL
1997                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1998                 if (mobj is None) or (mobj.group(1) is None):
1999                         # Try to figure out reason of the error.
2000                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2001                         if (mobj is not None) and (mobj.group(1) is not None):
2002                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2003                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2004                         else:
2005                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2006                         return
2007
2008                 file_url = mobj.group(1)
2009                 file_extension = os.path.splitext(file_url)[1][1:]
2010
2011                 # Search for file title
2012                 mobj = re.search(r'<b title="(.*?)">', webpage)
2013                 if mobj is None:
2014                         self._downloader.trouble(u'ERROR: unable to extract title')
2015                         return
2016                 file_title = mobj.group(1).decode('utf-8')
2017
2018                 return [{
2019                         'id':           file_id.decode('utf-8'),
2020                         'url':          file_url.decode('utf-8'),
2021                         'uploader':     None,
2022                         'upload_date':  None,
2023                         'title':        file_title,
2024                         'ext':          file_extension.decode('utf-8'),
2025                 }]
2026
2027
2028 class FacebookIE(InfoExtractor):
2029         """Information Extractor for Facebook"""
2030
2031         _WORKING = False
2032         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2033         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2034         _NETRC_MACHINE = 'facebook'
2035         _available_formats = ['video', 'highqual', 'lowqual']
2036         _video_extensions = {
2037                 'video': 'mp4',
2038                 'highqual': 'mp4',
2039                 'lowqual': 'mp4',
2040         }
2041         IE_NAME = u'facebook'
2042
2043         def __init__(self, downloader=None):
2044                 InfoExtractor.__init__(self, downloader)
2045
2046         def _reporter(self, message):
2047                 """Add header and report message."""
2048                 self._downloader.to_screen(u'[facebook] %s' % message)
2049
2050         def report_login(self):
2051                 """Report attempt to log in."""
2052                 self._reporter(u'Logging in')
2053
2054         def report_video_webpage_download(self, video_id):
2055                 """Report attempt to download video webpage."""
2056                 self._reporter(u'%s: Downloading video webpage' % video_id)
2057
2058         def report_information_extraction(self, video_id):
2059                 """Report attempt to extract video information."""
2060                 self._reporter(u'%s: Extracting video information' % video_id)
2061
2062         def _parse_page(self, video_webpage):
2063                 """Extract video information from page"""
2064                 # General data
2065                 data = {'title': r'\("video_title", "(.*?)"\)',
2066                         'description': r'<div class="datawrap">(.*?)</div>',
2067                         'owner': r'\("video_owner_name", "(.*?)"\)',
2068                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2069                         }
2070                 video_info = {}
2071                 for piece in data.keys():
2072                         mobj = re.search(data[piece], video_webpage)
2073                         if mobj is not None:
2074                                 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2075
2076                 # Video urls
2077                 video_urls = {}
2078                 for fmt in self._available_formats:
2079                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2080                         if mobj is not None:
2081                                 # URL is in a Javascript segment inside an escaped Unicode format within
2082                                 # the generally utf-8 page
2083                                 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2084                 video_info['video_urls'] = video_urls
2085
2086                 return video_info
2087
2088         def _real_initialize(self):
2089                 if self._downloader is None:
2090                         return
2091
2092                 useremail = None
2093                 password = None
2094                 downloader_params = self._downloader.params
2095
2096                 # Attempt to use provided username and password or .netrc data
2097                 if downloader_params.get('username', None) is not None:
2098                         useremail = downloader_params['username']
2099                         password = downloader_params['password']
2100                 elif downloader_params.get('usenetrc', False):
2101                         try:
2102                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2103                                 if info is not None:
2104                                         useremail = info[0]
2105                                         password = info[2]
2106                                 else:
2107                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2108                         except (IOError, netrc.NetrcParseError) as err:
2109                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2110                                 return
2111
2112                 if useremail is None:
2113                         return
2114
2115                 # Log in
2116                 login_form = {
2117                         'email': useremail,
2118                         'pass': password,
2119                         'login': 'Log+In'
2120                         }
2121                 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2122                 try:
2123                         self.report_login()
2124                         login_results = compat_urllib_request.urlopen(request).read()
2125                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2126                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2127                                 return
2128                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2129                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2130                         return
2131
2132         def _real_extract(self, url):
2133                 mobj = re.match(self._VALID_URL, url)
2134                 if mobj is None:
2135                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2136                         return
2137                 video_id = mobj.group('ID')
2138
2139                 # Get video webpage
2140                 self.report_video_webpage_download(video_id)
2141                 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2142                 try:
2143                         page = compat_urllib_request.urlopen(request)
2144                         video_webpage = page.read()
2145                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2146                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2147                         return
2148
2149                 # Start extracting information
2150                 self.report_information_extraction(video_id)
2151
2152                 # Extract information
2153                 video_info = self._parse_page(video_webpage)
2154
2155                 # uploader
2156                 if 'owner' not in video_info:
2157                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2158                         return
2159                 video_uploader = video_info['owner']
2160
2161                 # title
2162                 if 'title' not in video_info:
2163                         self._downloader.trouble(u'ERROR: unable to extract video title')
2164                         return
2165                 video_title = video_info['title']
2166                 video_title = video_title.decode('utf-8')
2167
2168                 # thumbnail image
2169                 if 'thumbnail' not in video_info:
2170                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2171                         video_thumbnail = ''
2172                 else:
2173                         video_thumbnail = video_info['thumbnail']
2174
2175                 # upload date
2176                 upload_date = None
2177                 if 'upload_date' in video_info:
2178                         upload_time = video_info['upload_date']
2179                         timetuple = email.utils.parsedate_tz(upload_time)
2180                         if timetuple is not None:
2181                                 try:
2182                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2183                                 except:
2184                                         pass
2185
2186                 # description
2187                 video_description = video_info.get('description', 'No description available.')
2188
2189                 url_map = video_info['video_urls']
2190                 if len(url_map.keys()) > 0:
2191                         # Decide which formats to download
2192                         req_format = self._downloader.params.get('format', None)
2193                         format_limit = self._downloader.params.get('format_limit', None)
2194
2195                         if format_limit is not None and format_limit in self._available_formats:
2196                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2197                         else:
2198                                 format_list = self._available_formats
2199                         existing_formats = [x for x in format_list if x in url_map]
2200                         if len(existing_formats) == 0:
2201                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2202                                 return
2203                         if req_format is None:
2204                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2205                         elif req_format == 'worst':
2206                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2207                         elif req_format == '-1':
2208                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2209                         else:
2210                                 # Specific format
2211                                 if req_format not in url_map:
2212                                         self._downloader.trouble(u'ERROR: requested format not available')
2213                                         return
2214                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2215
2216                 results = []
2217                 for format_param, video_real_url in video_url_list:
2218                         # Extension
2219                         video_extension = self._video_extensions.get(format_param, 'mp4')
2220
2221                         results.append({
2222                                 'id':           video_id.decode('utf-8'),
2223                                 'url':          video_real_url.decode('utf-8'),
2224                                 'uploader':     video_uploader.decode('utf-8'),
2225                                 'upload_date':  upload_date,
2226                                 'title':        video_title,
2227                                 'ext':          video_extension.decode('utf-8'),
2228                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2229                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2230                                 'description':  video_description.decode('utf-8'),
2231                         })
2232                 return results
2233
2234 class BlipTVIE(InfoExtractor):
2235         """Information extractor for blip.tv"""
2236
2237         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2238         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2239         IE_NAME = u'blip.tv'
2240
2241         def report_extraction(self, file_id):
2242                 """Report information extraction."""
2243                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2244
2245         def report_direct_download(self, title):
2246                 """Report information extraction."""
2247                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2248
2249         def _real_extract(self, url):
2250                 mobj = re.match(self._VALID_URL, url)
2251                 if mobj is None:
2252                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2253                         return
2254
2255                 if '?' in url:
2256                         cchar = '&'
2257                 else:
2258                         cchar = '?'
2259                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2260                 request = compat_urllib_request.Request(json_url.encode('utf-8'))
2261                 self.report_extraction(mobj.group(1))
2262                 info = None
2263                 try:
2264                         urlh = compat_urllib_request.urlopen(request)
2265                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2266                                 basename = url.split('/')[-1]
2267                                 title,ext = os.path.splitext(basename)
2268                                 title = title.decode('UTF-8')
2269                                 ext = ext.replace('.', '')
2270                                 self.report_direct_download(title)
2271                                 info = {
2272                                         'id': title,
2273                                         'url': url,
2274                                         'uploader': None,
2275                                         'upload_date': None,
2276                                         'title': title,
2277                                         'ext': ext,
2278                                         'urlhandle': urlh
2279                                 }
2280                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2281                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2282                         return
2283                 if info is None: # Regular URL
2284                         try:
2285                                 json_code = urlh.read()
2286                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2287                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2288                                 return
2289
2290                         try:
2291                                 json_data = json.loads(json_code)
2292                                 if 'Post' in json_data:
2293                                         data = json_data['Post']
2294                                 else:
2295                                         data = json_data
2296
2297                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2298                                 video_url = data['media']['url']
2299                                 umobj = re.match(self._URL_EXT, video_url)
2300                                 if umobj is None:
2301                                         raise ValueError('Can not determine filename extension')
2302                                 ext = umobj.group(1)
2303
2304                                 info = {
2305                                         'id': data['item_id'],
2306                                         'url': video_url,
2307                                         'uploader': data['display_name'],
2308                                         'upload_date': upload_date,
2309                                         'title': data['title'],
2310                                         'ext': ext,
2311                                         'format': data['media']['mimeType'],
2312                                         'thumbnail': data['thumbnailUrl'],
2313                                         'description': data['description'],
2314                                         'player_url': data['embedUrl']
2315                                 }
2316                         except (ValueError,KeyError) as err:
2317                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2318                                 return
2319
2320                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2321                 return [info]
2322
2323
2324 class MyVideoIE(InfoExtractor):
2325         """Information Extractor for myvideo.de."""
2326
2327         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2328         IE_NAME = u'myvideo'
2329
2330         def __init__(self, downloader=None):
2331                 InfoExtractor.__init__(self, downloader)
2332
2333         def report_download_webpage(self, video_id):
2334                 """Report webpage download."""
2335                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2336
2337         def report_extraction(self, video_id):
2338                 """Report information extraction."""
2339                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2340
2341         def _real_extract(self,url):
2342                 mobj = re.match(self._VALID_URL, url)
2343                 if mobj is None:
2344                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2345                         return
2346
2347                 video_id = mobj.group(1)
2348
2349                 # Get video webpage
2350                 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2351                 try:
2352                         self.report_download_webpage(video_id)
2353                         webpage = compat_urllib_request.urlopen(request).read()
2354                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2355                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2356                         return
2357
2358                 self.report_extraction(video_id)
2359                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2360                                  webpage)
2361                 if mobj is None:
2362                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2363                         return
2364                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2365
2366                 mobj = re.search('<title>([^<]+)</title>', webpage)
2367                 if mobj is None:
2368                         self._downloader.trouble(u'ERROR: unable to extract title')
2369                         return
2370
2371                 video_title = mobj.group(1)
2372
2373                 return [{
2374                         'id':           video_id,
2375                         'url':          video_url,
2376                         'uploader':     None,
2377                         'upload_date':  None,
2378                         'title':        video_title,
2379                         'ext':          u'flv',
2380                 }]
2381
2382 class ComedyCentralIE(InfoExtractor):
2383         """Information extractor for The Daily Show and Colbert Report """
2384
2385         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2386         IE_NAME = u'comedycentral'
2387
2388         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2389
2390         _video_extensions = {
2391                 '3500': 'mp4',
2392                 '2200': 'mp4',
2393                 '1700': 'mp4',
2394                 '1200': 'mp4',
2395                 '750': 'mp4',
2396                 '400': 'mp4',
2397         }
2398         _video_dimensions = {
2399                 '3500': '1280x720',
2400                 '2200': '960x540',
2401                 '1700': '768x432',
2402                 '1200': '640x360',
2403                 '750': '512x288',
2404                 '400': '384x216',
2405         }
2406
2407         def report_extraction(self, episode_id):
2408                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2409
2410         def report_config_download(self, episode_id):
2411                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2412
2413         def report_index_download(self, episode_id):
2414                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2415
2416         def report_player_url(self, episode_id):
2417                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2418
2419
2420         def _print_formats(self, formats):
2421                 print('Available formats:')
2422                 for x in formats:
2423                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2424
2425
2426         def _real_extract(self, url):
2427                 mobj = re.match(self._VALID_URL, url)
2428                 if mobj is None:
2429                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2430                         return
2431
2432                 if mobj.group('shortname'):
2433                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2434                                 url = u'http://www.thedailyshow.com/full-episodes/'
2435                         else:
2436                                 url = u'http://www.colbertnation.com/full-episodes/'
2437                         mobj = re.match(self._VALID_URL, url)
2438                         assert mobj is not None
2439
2440                 dlNewest = not mobj.group('episode')
2441                 if dlNewest:
2442                         epTitle = mobj.group('showname')
2443                 else:
2444                         epTitle = mobj.group('episode')
2445
2446                 req = compat_urllib_request.Request(url)
2447                 self.report_extraction(epTitle)
2448                 try:
2449                         htmlHandle = compat_urllib_request.urlopen(req)
2450                         html = htmlHandle.read()
2451                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2452                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2453                         return
2454                 if dlNewest:
2455                         url = htmlHandle.geturl()
2456                         mobj = re.match(self._VALID_URL, url)
2457                         if mobj is None:
2458                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2459                                 return
2460                         if mobj.group('episode') == '':
2461                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2462                                 return
2463                         epTitle = mobj.group('episode')
2464
2465                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2466
2467                 if len(mMovieParams) == 0:
2468                         # The Colbert Report embeds the information in a without
2469                         # a URL prefix; so extract the alternate reference
2470                         # and then add the URL prefix manually.
2471
2472                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2473                         if len(altMovieParams) == 0:
2474                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2475                                 return
2476                         else:
2477                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2478
2479                 playerUrl_raw = mMovieParams[0][0]
2480                 self.report_player_url(epTitle)
2481                 try:
2482                         urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2483                         playerUrl = urlHandle.geturl()
2484                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2486                         return
2487
2488                 uri = mMovieParams[0][1]
2489                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2490                 self.report_index_download(epTitle)
2491                 try:
2492                         indexXml = compat_urllib_request.urlopen(indexUrl).read()
2493                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2494                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2495                         return
2496
2497                 results = []
2498
2499                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2500                 itemEls = idoc.findall('.//item')
2501                 for itemEl in itemEls:
2502                         mediaId = itemEl.findall('./guid')[0].text
2503                         shortMediaId = mediaId.split(':')[-1]
2504                         showId = mediaId.split(':')[-2].replace('.com', '')
2505                         officialTitle = itemEl.findall('./title')[0].text
2506                         officialDate = itemEl.findall('./pubDate')[0].text
2507
2508                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2509                                                 compat_urllib_parse.urlencode({'uri': mediaId}))
2510                         configReq = compat_urllib_request.Request(configUrl)
2511                         self.report_config_download(epTitle)
2512                         try:
2513                                 configXml = compat_urllib_request.urlopen(configReq).read()
2514                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2515                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2516                                 return
2517
2518                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2519                         turls = []
2520                         for rendition in cdoc.findall('.//rendition'):
2521                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2522                                 turls.append(finfo)
2523
2524                         if len(turls) == 0:
2525                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2526                                 continue
2527
2528                         if self._downloader.params.get('listformats', None):
2529                                 self._print_formats([i[0] for i in turls])
2530                                 return
2531
2532                         # For now, just pick the highest bitrate
2533                         format,video_url = turls[-1]
2534
2535                         # Get the format arg from the arg stream
2536                         req_format = self._downloader.params.get('format', None)
2537
2538                         # Select format if we can find one
2539                         for f,v in turls:
2540                                 if f == req_format:
2541                                         format, video_url = f, v
2542                                         break
2543
2544                         # Patch to download from alternative CDN, which does not
2545                         # break on current RTMPDump builds
2546                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2547                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2548
2549                         if video_url.startswith(broken_cdn):
2550                                 video_url = video_url.replace(broken_cdn, better_cdn)
2551
2552                         effTitle = showId + u'-' + epTitle
2553                         info = {
2554                                 'id': shortMediaId,
2555                                 'url': video_url,
2556                                 'uploader': showId,
2557                                 'upload_date': officialDate,
2558                                 'title': effTitle,
2559                                 'ext': 'mp4',
2560                                 'format': format,
2561                                 'thumbnail': None,
2562                                 'description': officialTitle,
2563                                 'player_url': None #playerUrl
2564                         }
2565
2566                         results.append(info)
2567
2568                 return results
2569
2570
2571 class EscapistIE(InfoExtractor):
2572         """Information extractor for The Escapist """
2573
2574         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2575         IE_NAME = u'escapist'
2576
2577         def report_extraction(self, showName):
2578                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2579
2580         def report_config_download(self, showName):
2581                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2582
2583         def _real_extract(self, url):
2584                 mobj = re.match(self._VALID_URL, url)
2585                 if mobj is None:
2586                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2587                         return
2588                 showName = mobj.group('showname')
2589                 videoId = mobj.group('episode')
2590
2591                 self.report_extraction(showName)
2592                 try:
2593                         webPage = compat_urllib_request.urlopen(url)
2594                         webPageBytes = webPage.read()
2595                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2596                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2597                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2598                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2599                         return
2600
2601                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2602                 description = unescapeHTML(descMatch.group(1))
2603                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2604                 imgUrl = unescapeHTML(imgMatch.group(1))
2605                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2606                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2607                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2608                 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2609
2610                 self.report_config_download(showName)
2611                 try:
2612                         configJSON = compat_urllib_request.urlopen(configUrl).read()
2613                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2614                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2615                         return
2616
2617                 # Technically, it's JavaScript, not JSON
2618                 configJSON = configJSON.replace("'", '"')
2619
2620                 try:
2621                         config = json.loads(configJSON)
2622                 except (ValueError,) as err:
2623                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2624                         return
2625
2626                 playlist = config['playlist']
2627                 videoUrl = playlist[1]['url']
2628
2629                 info = {
2630                         'id': videoId,
2631                         'url': videoUrl,
2632                         'uploader': showName,
2633                         'upload_date': None,
2634                         'title': showName,
2635                         'ext': 'flv',
2636                         'thumbnail': imgUrl,
2637                         'description': description,
2638                         'player_url': playerUrl,
2639                 }
2640
2641                 return [info]
2642
2643
2644 class CollegeHumorIE(InfoExtractor):
2645         """Information extractor for collegehumor.com"""
2646
2647         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2648         IE_NAME = u'collegehumor'
2649
2650         def report_webpage(self, video_id):
2651                 """Report information extraction."""
2652                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2653
2654         def report_extraction(self, video_id):
2655                 """Report information extraction."""
2656                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2657
2658         def _real_extract(self, url):
2659                 mobj = re.match(self._VALID_URL, url)
2660                 if mobj is None:
2661                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2662                         return
2663                 video_id = mobj.group('videoid')
2664
2665                 self.report_webpage(video_id)
2666                 request = compat_urllib_request.Request(url)
2667                 try:
2668                         webpage = compat_urllib_request.urlopen(request).read()
2669                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2670                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2671                         return
2672
2673                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2674                 if m is None:
2675                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2676                         return
2677                 internal_video_id = m.group('internalvideoid')
2678
2679                 info = {
2680                         'id': video_id,
2681                         'internal_id': internal_video_id,
2682                         'uploader': None,
2683                         'upload_date': None,
2684                 }
2685
2686                 self.report_extraction(video_id)
2687                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2688                 try:
2689                         metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2690                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2691                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2692                         return
2693
2694                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2695                 try:
2696                         videoNode = mdoc.findall('./video')[0]
2697                         info['description'] = videoNode.findall('./description')[0].text
2698                         info['title'] = videoNode.findall('./caption')[0].text
2699                         info['url'] = videoNode.findall('./file')[0].text
2700                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2701                         info['ext'] = info['url'].rpartition('.')[2]
2702                 except IndexError:
2703                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2704                         return
2705
2706                 return [info]
2707
2708
2709 class XVideosIE(InfoExtractor):
2710         """Information extractor for xvideos.com"""
2711
2712         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2713         IE_NAME = u'xvideos'
2714
2715         def report_webpage(self, video_id):
2716                 """Report information extraction."""
2717                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2718
2719         def report_extraction(self, video_id):
2720                 """Report information extraction."""
2721                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2722
2723         def _real_extract(self, url):
2724                 mobj = re.match(self._VALID_URL, url)
2725                 if mobj is None:
2726                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2727                         return
2728                 video_id = mobj.group(1).decode('utf-8')
2729
2730                 self.report_webpage(video_id)
2731
2732                 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2733                 try:
2734                         webpage = compat_urllib_request.urlopen(request).read()
2735                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2736                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2737                         return
2738
2739                 self.report_extraction(video_id)
2740
2741
2742                 # Extract video URL
2743                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2744                 if mobj is None:
2745                         self._downloader.trouble(u'ERROR: unable to extract video url')
2746                         return
2747                 video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))
2748
2749
2750                 # Extract title
2751                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2752                 if mobj is None:
2753                         self._downloader.trouble(u'ERROR: unable to extract video title')
2754                         return
2755                 video_title = mobj.group(1).decode('utf-8')
2756
2757
2758                 # Extract video thumbnail
2759                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2760                 if mobj is None:
2761                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2762                         return
2763                 video_thumbnail = mobj.group(0).decode('utf-8')
2764
2765                 info = {
2766                         'id': video_id,
2767                         'url': video_url,
2768                         'uploader': None,
2769                         'upload_date': None,
2770                         'title': video_title,
2771                         'ext': 'flv',
2772                         'thumbnail': video_thumbnail,
2773                         'description': None,
2774                 }
2775
2776                 return [info]
2777
2778
2779 class SoundcloudIE(InfoExtractor):
2780         """Information extractor for soundcloud.com
2781            To access the media, the uid of the song and a stream token
2782            must be extracted from the page source and the script must make
2783            a request to media.soundcloud.com/crossdomain.xml. Then
2784            the media can be grabbed by requesting from an url composed
2785            of the stream token and uid
2786          """
2787
2788         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2789         IE_NAME = u'soundcloud'
2790
2791         def __init__(self, downloader=None):
2792                 InfoExtractor.__init__(self, downloader)
2793
2794         def report_webpage(self, video_id):
2795                 """Report information extraction."""
2796                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2797
2798         def report_extraction(self, video_id):
2799                 """Report information extraction."""
2800                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2801
2802         def _real_extract(self, url):
2803                 mobj = re.match(self._VALID_URL, url)
2804                 if mobj is None:
2805                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2806                         return
2807
2808                 # extract uploader (which is in the url)
2809                 uploader = mobj.group(1).decode('utf-8')
2810                 # extract simple title (uploader + slug of song title)
2811                 slug_title =  mobj.group(2).decode('utf-8')
2812                 simple_title = uploader + u'-' + slug_title
2813
2814                 self.report_webpage('%s/%s' % (uploader, slug_title))
2815
2816                 request = compat_urllib_request.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2817                 try:
2818                         webpage = compat_urllib_request.urlopen(request).read()
2819                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2820                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2821                         return
2822
2823                 self.report_extraction('%s/%s' % (uploader, slug_title))
2824
2825                 # extract uid and stream token that soundcloud hands out for access
2826                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2827                 if mobj:
2828                         video_id = mobj.group(1)
2829                         stream_token = mobj.group(2)
2830
2831                 # extract unsimplified title
2832                 mobj = re.search('"title":"(.*?)",', webpage)
2833                 if mobj:
2834                         title = mobj.group(1).decode('utf-8')
2835                 else:
2836                         title = simple_title
2837
2838                 # construct media url (with uid/token)
2839                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2840                 mediaURL = mediaURL % (video_id, stream_token)
2841
2842                 # description
2843                 description = u'No description available'
2844                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2845                 if mobj:
2846                         description = mobj.group(1)
2847
2848                 # upload date
2849                 upload_date = None
2850                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2851                 if mobj:
2852                         try:
2853                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2854                         except Exception, e:
2855                                 self._downloader.to_stderr(compat_str(e))
2856
2857                 # for soundcloud, a request to a cross domain is required for cookies
2858                 request = compat_urllib_request.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2859
2860                 return [{
2861                         'id':           video_id.decode('utf-8'),
2862                         'url':          mediaURL,
2863                         'uploader':     uploader.decode('utf-8'),
2864                         'upload_date':  upload_date,
2865                         'title':        title,
2866                         'ext':          u'mp3',
2867                         'description': description.decode('utf-8')
2868                 }]
2869
2870
2871 class InfoQIE(InfoExtractor):
2872         """Information extractor for infoq.com"""
2873
2874         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2875         IE_NAME = u'infoq'
2876
2877         def report_webpage(self, video_id):
2878                 """Report information extraction."""
2879                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2880
2881         def report_extraction(self, video_id):
2882                 """Report information extraction."""
2883                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2884
2885         def _real_extract(self, url):
2886                 mobj = re.match(self._VALID_URL, url)
2887                 if mobj is None:
2888                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2889                         return
2890
2891                 self.report_webpage(url)
2892
2893                 request = compat_urllib_request.Request(url)
2894                 try:
2895                         webpage = compat_urllib_request.urlopen(request).read()
2896                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2897                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2898                         return
2899
2900                 self.report_extraction(url)
2901
2902
2903                 # Extract video URL
2904                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2905                 if mobj is None:
2906                         self._downloader.trouble(u'ERROR: unable to extract video url')
2907                         return
2908                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2909
2910
2911                 # Extract title
2912                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2913                 if mobj is None:
2914                         self._downloader.trouble(u'ERROR: unable to extract video title')
2915                         return
2916                 video_title = mobj.group(1).decode('utf-8')
2917
2918                 # Extract description
2919                 video_description = u'No description available.'
2920                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2921                 if mobj is not None:
2922                         video_description = mobj.group(1).decode('utf-8')
2923
2924                 video_filename = video_url.split('/')[-1]
2925                 video_id, extension = video_filename.split('.')
2926
2927                 info = {
2928                         'id': video_id,
2929                         'url': video_url,
2930                         'uploader': None,
2931                         'upload_date': None,
2932                         'title': video_title,
2933                         'ext': extension, # Extension is always(?) mp4, but seems to be flv
2934                         'thumbnail': None,
2935                         'description': video_description,
2936                 }
2937
2938                 return [info]
2939
2940 class MixcloudIE(InfoExtractor):
2941         """Information extractor for www.mixcloud.com"""
2942         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2943         IE_NAME = u'mixcloud'
2944
2945         def __init__(self, downloader=None):
2946                 InfoExtractor.__init__(self, downloader)
2947
2948         def report_download_json(self, file_id):
2949                 """Report JSON download."""
2950                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2951
2952         def report_extraction(self, file_id):
2953                 """Report information extraction."""
2954                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2955
2956         def get_urls(self, jsonData, fmt, bitrate='best'):
2957                 """Get urls from 'audio_formats' section in json"""
2958                 file_url = None
2959                 try:
2960                         bitrate_list = jsonData[fmt]
2961                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2962                                 bitrate = max(bitrate_list) # select highest
2963
2964                         url_list = jsonData[fmt][bitrate]
2965                 except TypeError: # we have no bitrate info.
2966                         url_list = jsonData[fmt]
2967                 return url_list
2968
2969         def check_urls(self, url_list):
2970                 """Returns 1st active url from list"""
2971                 for url in url_list:
2972                         try:
2973                                 compat_urllib_request.urlopen(url)
2974                                 return url
2975                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2976                                 url = None
2977
2978                 return None
2979
2980         def _print_formats(self, formats):
2981                 print('Available formats:')
2982                 for fmt in formats.keys():
2983                         for b in formats[fmt]:
2984                                 try:
2985                                         ext = formats[fmt][b][0]
2986                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2987                                 except TypeError: # we have no bitrate info
2988                                         ext = formats[fmt][0]
2989                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2990                                         break
2991
2992         def _real_extract(self, url):
2993                 mobj = re.match(self._VALID_URL, url)
2994                 if mobj is None:
2995                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2996                         return
2997                 # extract uploader & filename from url
2998                 uploader = mobj.group(1).decode('utf-8')
2999                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3000
3001                 # construct API request
3002                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3003                 # retrieve .json file with links to files
3004                 request = compat_urllib_request.Request(file_url)
3005                 try:
3006                         self.report_download_json(file_url)
3007                         jsonData = compat_urllib_request.urlopen(request).read()
3008                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3009                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
3010                         return
3011
3012                 # parse JSON
3013                 json_data = json.loads(jsonData)
3014                 player_url = json_data['player_swf_url']
3015                 formats = dict(json_data['audio_formats'])
3016
3017                 req_format = self._downloader.params.get('format', None)
3018                 bitrate = None
3019
3020                 if self._downloader.params.get('listformats', None):
3021                         self._print_formats(formats)
3022                         return
3023
3024                 if req_format is None or req_format == 'best':
3025                         for format_param in formats.keys():
3026                                 url_list = self.get_urls(formats, format_param)
3027                                 # check urls
3028                                 file_url = self.check_urls(url_list)
3029                                 if file_url is not None:
3030                                         break # got it!
3031                 else:
3032                         if req_format not in formats.keys():
3033                                 self._downloader.trouble(u'ERROR: format is not available')
3034                                 return
3035
3036                         url_list = self.get_urls(formats, req_format)
3037                         file_url = self.check_urls(url_list)
3038                         format_param = req_format
3039
3040                 return [{
3041                         'id': file_id.decode('utf-8'),
3042                         'url': file_url.decode('utf-8'),
3043                         'uploader':     uploader.decode('utf-8'),
3044                         'upload_date': None,
3045                         'title': json_data['name'],
3046                         'ext': file_url.split('.')[-1].decode('utf-8'),
3047                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3048                         'thumbnail': json_data['thumbnail_url'],
3049                         'description': json_data['description'],
3050                         'player_url': player_url.decode('utf-8'),
3051                 }]
3052
3053 class StanfordOpenClassroomIE(InfoExtractor):
3054         """Information extractor for Stanford's Open ClassRoom"""
3055
3056         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3057         IE_NAME = u'stanfordoc'
3058
3059         def report_download_webpage(self, objid):
3060                 """Report information extraction."""
3061                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3062
3063         def report_extraction(self, video_id):
3064                 """Report information extraction."""
3065                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3066
3067         def _real_extract(self, url):
3068                 mobj = re.match(self._VALID_URL, url)
3069                 if mobj is None:
3070                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3071                         return
3072
3073                 if mobj.group('course') and mobj.group('video'): # A specific video
3074                         course = mobj.group('course')
3075                         video = mobj.group('video')
3076                         info = {
3077                                 'id': course + '_' + video,
3078                                 'uploader': None,
3079                                 'upload_date': None,
3080                         }
3081
3082                         self.report_extraction(info['id'])
3083                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3084                         xmlUrl = baseUrl + video + '.xml'
3085                         try:
3086                                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3087                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3088                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3089                                 return
3090                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
3091                         try:
3092                                 info['title'] = mdoc.findall('./title')[0].text
3093                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3094                         except IndexError:
3095                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3096                                 return
3097                         info['ext'] = info['url'].rpartition('.')[2]
3098                         return [info]
3099                 elif mobj.group('course'): # A course page
3100                         course = mobj.group('course')
3101                         info = {
3102                                 'id': course,
3103                                 'type': 'playlist',
3104                                 'uploader': None,
3105                                 'upload_date': None,
3106                         }
3107
3108                         self.report_download_webpage(info['id'])
3109                         try:
3110                                 coursepage = compat_urllib_request.urlopen(url).read()
3111                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3112                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3113                                 return
3114
3115                         m = re.search('<h1>([^<]+)</h1>', coursepage)
3116                         if m:
3117                                 info['title'] = unescapeHTML(m.group(1))
3118                         else:
3119                                 info['title'] = info['id']
3120
3121                         m = re.search('<description>([^<]+)</description>', coursepage)
3122                         if m:
3123                                 info['description'] = unescapeHTML(m.group(1))
3124
3125                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3126                         info['list'] = [
3127                                 {
3128                                         'type': 'reference',
3129                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3130                                 }
3131                                         for vpage in links]
3132                         results = []
3133                         for entry in info['list']:
3134                                 assert entry['type'] == 'reference'
3135                                 results += self.extract(entry['url'])
3136                         return results
3137
3138                 else: # Root page
3139                         info = {
3140                                 'id': 'Stanford OpenClassroom',
3141                                 'type': 'playlist',
3142                                 'uploader': None,
3143                                 'upload_date': None,
3144                         }
3145
3146                         self.report_download_webpage(info['id'])
3147                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3148                         try:
3149                                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3150                         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3151                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3152                                 return
3153
3154                         info['title'] = info['id']
3155
3156                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3157                         info['list'] = [
3158                                 {
3159                                         'type': 'reference',
3160                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3161                                 }
3162                                         for cpage in links]
3163
3164                         results = []
3165                         for entry in info['list']:
3166                                 assert entry['type'] == 'reference'
3167                                 results += self.extract(entry['url'])
3168                         return results
3169
3170 class MTVIE(InfoExtractor):
3171         """Information extractor for MTV.com"""
3172
3173         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3174         IE_NAME = u'mtv'
3175
3176         def report_webpage(self, video_id):
3177                 """Report information extraction."""
3178                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3179
3180         def report_extraction(self, video_id):
3181                 """Report information extraction."""
3182                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3183
3184         def _real_extract(self, url):
3185                 mobj = re.match(self._VALID_URL, url)
3186                 if mobj is None:
3187                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3188                         return
3189                 if not mobj.group('proto'):
3190                         url = 'http://' + url
3191                 video_id = mobj.group('videoid')
3192                 self.report_webpage(video_id)
3193
3194                 request = compat_urllib_request.Request(url)
3195                 try:
3196                         webpage = compat_urllib_request.urlopen(request).read()
3197                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3198                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
3199                         return
3200
3201                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3202                 if mobj is None:
3203                         self._downloader.trouble(u'ERROR: unable to extract song name')
3204                         return
3205                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3206                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3207                 if mobj is None:
3208                         self._downloader.trouble(u'ERROR: unable to extract performer')
3209                         return
3210                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3211                 video_title = performer + ' - ' + song_name
3212
3213                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3214                 if mobj is None:
3215                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3216                         return
3217                 mtvn_uri = mobj.group(1)
3218
3219                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3220                 if mobj is None:
3221                         self._downloader.trouble(u'ERROR: unable to extract content id')
3222                         return
3223                 content_id = mobj.group(1)
3224
3225                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3226                 self.report_extraction(video_id)
3227                 request = compat_urllib_request.Request(videogen_url)
3228                 try:
3229                         metadataXml = compat_urllib_request.urlopen(request).read()
3230                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3231                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3232                         return
3233
3234                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3235                 renditions = mdoc.findall('.//rendition')
3236
3237                 # For now, always pick the highest quality.
3238                 rendition = renditions[-1]
3239
3240                 try:
3241                         _,_,ext = rendition.attrib['type'].partition('/')
3242                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3243                         video_url = rendition.find('./src').text
3244                 except KeyError:
3245                         self._downloader.trouble('Invalid rendition field.')
3246                         return
3247
3248                 info = {
3249                         'id': video_id,
3250                         'url': video_url,
3251                         'uploader': performer,
3252                         'upload_date': None,
3253                         'title': video_title,
3254                         'ext': ext,
3255                         'format': format,
3256                 }
3257
3258                 return [info]
3259
3260
3261 class YoukuIE(InfoExtractor):
3262
3263         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3264         IE_NAME = u'Youku'
3265
3266         def __init__(self, downloader=None):
3267                 InfoExtractor.__init__(self, downloader)
3268
3269         def report_download_webpage(self, file_id):
3270                 """Report webpage download."""
3271                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3272
3273         def report_extraction(self, file_id):
3274                 """Report information extraction."""
3275                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3276
3277         def _gen_sid(self):
3278                 nowTime = int(time.time() * 1000)
3279                 random1 = random.randint(1000,1998)
3280                 random2 = random.randint(1000,9999)
3281
3282                 return "%d%d%d" %(nowTime,random1,random2)
3283
3284         def _get_file_ID_mix_string(self, seed):
3285                 mixed = []
3286                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3287                 seed = float(seed)
3288                 for i in range(len(source)):
3289                         seed  =  (seed * 211 + 30031 ) % 65536
3290                         index  =  math.floor(seed / 65536 * len(source) )
3291                         mixed.append(source[int(index)])
3292                         source.remove(source[int(index)])
3293                 #return ''.join(mixed)
3294                 return mixed
3295
3296         def _get_file_id(self, fileId, seed):
3297                 mixed = self._get_file_ID_mix_string(seed)
3298                 ids = fileId.split('*')
3299                 realId = []
3300                 for ch in ids:
3301                         if ch:
3302                                 realId.append(mixed[int(ch)])
3303                 return ''.join(realId)
3304
3305         def _real_extract(self, url):
3306                 mobj = re.match(self._VALID_URL, url)
3307                 if mobj is None:
3308                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3309                         return
3310                 video_id = mobj.group('ID')
3311
3312                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3313
3314                 request = compat_urllib_request.Request(info_url, None, std_headers)
3315                 try:
3316                         self.report_download_webpage(video_id)
3317                         jsondata = compat_urllib_request.urlopen(request).read()
3318                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3319                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3320                         return
3321
3322                 self.report_extraction(video_id)
3323                 try:
3324                         config = json.loads(jsondata)
3325
3326                         video_title =  config['data'][0]['title']
3327                         seed = config['data'][0]['seed']
3328
3329                         format = self._downloader.params.get('format', None)
3330                         supported_format = config['data'][0]['streamfileids'].keys()
3331
3332                         if format is None or format == 'best':
3333                                 if 'hd2' in supported_format:
3334                                         format = 'hd2'
3335                                 else:
3336                                         format = 'flv'
3337                                 ext = u'flv'
3338                         elif format == 'worst':
3339                                 format = 'mp4'
3340                                 ext = u'mp4'
3341                         else:
3342                                 format = 'flv'
3343                                 ext = u'flv'
3344
3345
3346                         fileid = config['data'][0]['streamfileids'][format]
3347                         seg_number = len(config['data'][0]['segs'][format])
3348
3349                         keys=[]
3350                         for i in xrange(seg_number):
3351                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3352
3353                         #TODO check error
3354                         #youku only could be viewed from mainland china
3355                 except:
3356                         self._downloader.trouble(u'ERROR: unable to extract info section')
3357                         return
3358
3359                 files_info=[]
3360                 sid = self._gen_sid()
3361                 fileid = self._get_file_id(fileid, seed)
3362
3363                 #column 8,9 of fileid represent the segment number
3364                 #fileid[7:9] should be changed
3365                 for index, key in enumerate(keys):
3366
3367                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3368                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3369
3370                         info = {
3371                                 'id': '%s_part%02d' % (video_id, index),
3372                                 'url': download_url,
3373                                 'uploader': None,
3374                                 'upload_date': None,
3375                                 'title': video_title,
3376                                 'ext': ext,
3377                         }
3378                         files_info.append(info)
3379
3380                 return files_info
3381
3382
3383 class XNXXIE(InfoExtractor):
3384         """Information extractor for xnxx.com"""
3385
3386         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3387         IE_NAME = u'xnxx'
3388         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3389         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3390         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3391
3392         def report_webpage(self, video_id):
3393                 """Report information extraction"""
3394                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3395
3396         def report_extraction(self, video_id):
3397                 """Report information extraction"""
3398                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3399
3400         def _real_extract(self, url):
3401                 mobj = re.match(self._VALID_URL, url)
3402                 if mobj is None:
3403                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3404                         return
3405                 video_id = mobj.group(1).decode('utf-8')
3406
3407                 self.report_webpage(video_id)
3408
3409                 # Get webpage content
3410                 try:
3411                         webpage = compat_urllib_request.urlopen(url).read()
3412                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3413                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3414                         return
3415
3416                 result = re.search(self.VIDEO_URL_RE, webpage)
3417                 if result is None:
3418                         self._downloader.trouble(u'ERROR: unable to extract video url')
3419                         return
3420                 video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))
3421
3422                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3423                 if result is None:
3424                         self._downloader.trouble(u'ERROR: unable to extract video title')
3425                         return
3426                 video_title = result.group(1).decode('utf-8')
3427
3428                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3429                 if result is None:
3430                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3431                         return
3432                 video_thumbnail = result.group(1).decode('utf-8')
3433
3434                 return [{
3435                         'id': video_id,
3436                         'url': video_url,
3437                         'uploader': None,
3438                         'upload_date': None,
3439                         'title': video_title,
3440                         'ext': 'flv',
3441                         'thumbnail': video_thumbnail,
3442                         'description': None,
3443                 }]
3444
3445
3446 class GooglePlusIE(InfoExtractor):
3447         """Information extractor for plus.google.com."""
3448
3449         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3450         IE_NAME = u'plus.google'
3451
3452         def __init__(self, downloader=None):
3453                 InfoExtractor.__init__(self, downloader)
3454
3455         def report_extract_entry(self, url):
3456                 """Report downloading extry"""
3457                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3458
3459         def report_date(self, upload_date):
3460                 """Report downloading extry"""
3461                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3462
3463         def report_uploader(self, uploader):
3464                 """Report downloading extry"""
3465                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3466
3467         def report_title(self, video_title):
3468                 """Report downloading extry"""
3469                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3470
3471         def report_extract_vid_page(self, video_page):
3472                 """Report information extraction."""
3473                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3474
3475         def _real_extract(self, url):
3476                 # Extract id from URL
3477                 mobj = re.match(self._VALID_URL, url)
3478                 if mobj is None:
3479                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3480                         return
3481
3482                 post_url = mobj.group(0)
3483                 video_id = mobj.group(2)
3484
3485                 video_extension = 'flv'
3486
3487                 # Step 1, Retrieve post webpage to extract further information
3488                 self.report_extract_entry(post_url)
3489                 request = compat_urllib_request.Request(post_url)
3490                 try:
3491                         webpage = compat_urllib_request.urlopen(request).read()
3492                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3493                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3494                         return
3495
3496                 # Extract update date
3497                 upload_date = None
3498                 pattern = 'title="Timestamp">(.*?)</a>'
3499                 mobj = re.search(pattern, webpage)
3500                 if mobj:
3501                         upload_date = mobj.group(1)
3502                         # Convert timestring to a format suitable for filename
3503                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3504                         upload_date = upload_date.strftime('%Y%m%d')
3505                 self.report_date(upload_date)
3506
3507                 # Extract uploader
3508                 uploader = None
3509                 pattern = r'rel\="author".*?>(.*?)</a>'
3510                 mobj = re.search(pattern, webpage)
3511                 if mobj:
3512                         uploader = mobj.group(1)
3513                 self.report_uploader(uploader)
3514
3515                 # Extract title
3516                 # Get the first line for title
3517                 video_title = u'NA'
3518                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3519                 mobj = re.search(pattern, webpage)
3520                 if mobj:
3521                         video_title = mobj.group(1)
3522                 self.report_title(video_title)
3523
3524                 # Step 2, Stimulate clicking the image box to launch video
3525                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3526                 mobj = re.search(pattern, webpage)
3527                 if mobj is None:
3528                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3529
3530                 video_page = mobj.group(1)
3531                 request = compat_urllib_request.Request(video_page)
3532                 try:
3533                         webpage = compat_urllib_request.urlopen(request).read()
3534                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3535                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3536                         return
3537                 self.report_extract_vid_page(video_page)
3538
3539
3540                 # Extract video links on video page
3541                 """Extract video links of all sizes"""
3542                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3543                 mobj = re.findall(pattern, webpage)
3544                 if len(mobj) == 0:
3545                         self._downloader.trouble(u'ERROR: unable to extract video links')
3546
3547                 # Sort in resolution
3548                 links = sorted(mobj)
3549
3550                 # Choose the lowest of the sort, i.e. highest resolution
3551                 video_url = links[-1]
3552                 # Only get the url. The resolution part in the tuple has no use anymore
3553                 video_url = video_url[-1]
3554                 # Treat escaped \u0026 style hex
3555                 video_url = unicode(video_url, "unicode_escape")
3556
3557
3558                 return [{
3559                         'id':           video_id.decode('utf-8'),
3560                         'url':          video_url,
3561                         'uploader':     uploader.decode('utf-8'),
3562                         'upload_date':  upload_date.decode('utf-8'),
3563                         'title':        video_title.decode('utf-8'),
3564                         'ext':          video_extension.decode('utf-8'),
3565                 }]