2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
39 The dictionaries must include the following fields:
43 uploader: Nickname of the video uploader, unescaped.
44 upload_date: Video upload date (YYYYMMDD).
45 title: Video title, unescaped.
46 ext: Video filename extension.
48 The following fields are optional:
50 format: The video format, defaults to ext (used for --get-format)
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The .srt file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib2.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# NOTE(review): this is a line-numbered dump with interior lines elided (the
# embedded numbers jump, e.g. 68 -> 75), so class attributes such as _WORKING
# itself are not visible here -- TODO recover the full source before editing.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
80 def suitable(self, url):
81 """Receives a URL and returns True if suitable for this IE."""
82 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def` headers for the two methods documented below
# (presumably working() and initialize(), orig. lines 84 and 88) are on
# elided lines; only their docstrings/bodies survived the dump.
85 """Getter method for _WORKING."""
89 """Initializes an instance (authentication, etc)."""
91 self._real_initialize()
94 def extract(self, url):
95 """Extracts URL information and returns it in list of dicts."""
97 return self._real_extract(url)
99 def set_downloader(self, downloader):
100 """Sets the downloader for this IE."""
101 self._downloader = downloader
103 def _real_initialize(self):
104 """Real initialization process. Redefine in subclasses."""
107 def _real_extract(self, url):
108 """Real extraction process. Redefine in subclasses."""
112 class YoutubeIE(InfoExtractor):
113 """Information extractor for youtube.com."""
# NOTE(review): the `_VALID_URL = r'''` opener for the verbose regex below is
# on an elided line (orig. 114-116); the fragment that follows is the regex
# body only. Group 1 is the "things that precede the ID" alternation; group 2
# is the 11-character video ID (see mobj.group(2) in _real_extract).
117 (?:https?://)? # http(s):// (optional)
118 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
119 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
120 (?:.*?\#/)? # handle anchor (#/) redirect urls
121 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
122 (?: # the various things that can precede the ID:
123 (?:(?:v|embed|e)/) # v/ or embed/ or e/
124 |(?: # or the v= param in all its forms
125 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
126 (?:\?|\#!?) # the params delimiter ? or # or #!
127 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
130 )? # optional -> youtube.com/xxxx is OK
131 )? # all until now is optional -> you can pass the naked ID
132 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
133 (?(1).+)? # if we found the ID, everything can follow
135 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
136 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
137 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
138 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
139 _NETRC_MACHINE = 'youtube'
140 # Listed in order of quality
141 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
142 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map; most entries are on elided lines (orig. 144-148).
143 _video_extensions = {
149 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display-size map; all entries are on elided lines (orig. 156+).
155 _video_dimensions = {
173 def suitable(self, url):
174 """Receives a URL and returns True if suitable for this IE."""
175 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
177 def report_lang(self):
178 """Report attempt to set language."""
179 self._downloader.to_screen(u'[youtube] Setting language')
181 def report_login(self):
182 """Report attempt to log in."""
183 self._downloader.to_screen(u'[youtube] Logging in')
185 def report_age_confirmation(self):
186 """Report attempt to confirm age."""
187 self._downloader.to_screen(u'[youtube] Confirming age')
189 def report_video_webpage_download(self, video_id):
190 """Report attempt to download video webpage."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
193 def report_video_info_webpage_download(self, video_id):
194 """Report attempt to download video info webpage."""
195 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
197 def report_video_subtitles_download(self, video_id):
198 """Report attempt to download video info webpage."""
199 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
201 def report_information_extraction(self, video_id):
202 """Report attempt to extract video information."""
203 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
205 def report_unavailable_format(self, video_id, format):
206 """Report extracted video URL."""
207 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
209 def report_rtmp_download(self):
210 """Indicate the download will use the RTMP protocol."""
211 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML to SRT text by regex-scraping the
# <text start="..." dur="...">caption</text> nodes.
# NOTE(review): the `srt` accumulator initialisation and a `start = float(start)`
# conversion appear to be on elided lines (orig. ~214 and 219) -- without them
# the arithmetic on `start` below would fail; TODO confirm against full source.
213 def _closed_captions_xml_to_srt(self, xml_string):
215 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
216 # TODO parse xml instead of regex
217 for n, (start, dur_tag, dur, caption) in enumerate(texts):
218 if not dur: dur = '4'
220 end = start + float(dur)
221 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
222 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
223 caption = unescapeHTML(caption)
224 caption = unescapeHTML(caption) # double cycle, intentional
225 srt += str(n+1) + '\n'
226 srt += start + ' --> ' + end + '\n'
227 srt += caption + '\n\n'
# Prints each format's itag, extension and display size (used by --list-formats).
230 def _print_formats(self, formats):
231 print('Available formats:')
233 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Authentication flow: pick credentials from --username/--password or .netrc,
# force the English site via _LANG_URL, log in, then confirm age.
# NOTE(review): Python 2 `except (...), err:` syntax throughout; `u(err)`
# is presumably a unicode-coercion helper defined elsewhere -- verify.
235 def _real_initialize(self):
236 if self._downloader is None:
241 downloader_params = self._downloader.params
243 # Attempt to use provided username and password or .netrc data
244 if downloader_params.get('username', None) is not None:
245 username = downloader_params['username']
246 password = downloader_params['password']
247 elif downloader_params.get('usenetrc', False):
249 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
254 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
255 except (IOError, netrc.NetrcParseError), err:
256 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
260 request = urllib2.Request(self._LANG_URL)
263 urllib2.urlopen(request).read()
264 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
265 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % u(err))
268 # No authentication to be performed
274 'current_form': 'loginForm',
276 'action_login': 'Log In',
277 'username': username,
278 'password': password,
280 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
283 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
284 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
285 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
287 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
288 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
294 'action_confirm': 'Confirm',
296 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
298 self.report_age_confirmation()
299 age_results = urllib2.urlopen(request).read()
300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
301 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
# Main extraction: resolve next_url redirects, fetch the watch page and the
# get_video_info endpoint, then build one info dict per requested format.
304 def _real_extract(self, url):
305 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
306 mobj = re.search(self._NEXT_URL_RE, url)
308 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
310 # Extract video id from URL
311 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
313 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
315 video_id = mobj.group(2)
318 self.report_video_webpage_download(video_id)
319 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
321 video_webpage = urllib2.urlopen(request).read()
322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
323 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
326 # Attempt to extract SWF player URL
327 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
329 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several &el= variants; stop at the first response containing a token.
334 self.report_video_info_webpage_download(video_id)
335 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
336 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
337 % (video_id, el_type))
338 request = urllib2.Request(video_info_url)
340 video_info_webpage = urllib2.urlopen(request).read()
341 video_info = parse_qs(video_info_webpage)
342 if 'token' in video_info:
344 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
345 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
347 if 'token' not in video_info:
348 if 'reason' in video_info:
349 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
351 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
354 # Check for "rental" videos
355 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
356 self._downloader.trouble(u'ERROR: "rental" videos not supported')
359 # Start extracting information
360 self.report_information_extraction(video_id)
363 if 'author' not in video_info:
364 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
366 video_uploader = urllib.unquote_plus(video_info['author'][0])
369 if 'title' not in video_info:
370 self._downloader.trouble(u'ERROR: unable to extract video title')
372 video_title = urllib.unquote_plus(video_info['title'][0])
373 video_title = video_title.decode('utf-8')
376 if 'thumbnail_url' not in video_info:
377 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
379 else: # don't panic if we can't find it
380 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the page and tried against several textual
# date layouts before normalising to YYYYMMDD.
384 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
386 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
387 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
388 for expression in format_expressions:
390 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
395 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
396 if video_description: video_description = clean_html(video_description)
397 else: video_description = ''
# Optional subtitles: list available caption tracks, pick --sub-lang, 'en',
# or the first listed language, then convert the XML track to SRT.
# Trouble(...) is raised for soft failures and reported in the handler below.
400 video_subtitles = None
401 if self._downloader.params.get('writesubtitles', False):
403 self.report_video_subtitles_download(video_id)
404 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
406 srt_list = urllib2.urlopen(request).read()
407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
408 raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
409 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
410 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
411 if not srt_lang_list:
412 raise Trouble(u'WARNING: video has no closed captions')
413 if self._downloader.params.get('subtitleslang', False):
414 srt_lang = self._downloader.params.get('subtitleslang')
415 elif 'en' in srt_lang_list:
418 srt_lang = srt_lang_list.keys()[0]
419 if not srt_lang in srt_lang_list:
420 raise Trouble(u'WARNING: no closed captions found in the specified language')
421 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
423 srt_xml = urllib2.urlopen(request).read()
424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
425 raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
427 raise Trouble(u'WARNING: unable to download video subtitles')
428 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
429 except Trouble as trouble:
430 self._downloader.trouble(trouble[0])
432 if 'length_seconds' not in video_info:
433 self._downloader.trouble(u'WARNING: unable to extract video duration')
436 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
439 video_token = urllib.unquote_plus(video_info['token'][0])
441 # Decide which formats to download
442 req_format = self._downloader.params.get('format', None)
444 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
445 self.report_rtmp_download()
446 video_url_list = [(None, video_info['conn'][0])]
447 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
448 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
449 url_data = [parse_qs(uds) for uds in url_data_strs]
450 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
# NOTE(review): ud['sig'] is indexed without the 'itag'/'url' presence check
# above guarding it -- streams lacking a 'sig' field would raise KeyError
# here; verify against the full source whether that is handled elsewhere.
451 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
453 format_limit = self._downloader.params.get('format_limit', None)
454 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
455 if format_limit is not None and format_limit in available_formats:
456 format_list = available_formats[available_formats.index(format_limit):]
458 format_list = available_formats
459 existing_formats = [x for x in format_list if x in url_map]
460 if len(existing_formats) == 0:
461 self._downloader.trouble(u'ERROR: no known formats available for video')
463 if self._downloader.params.get('listformats', None):
464 self._print_formats(existing_formats)
466 if req_format is None or req_format == 'best':
467 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
468 elif req_format == 'worst':
469 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
470 elif req_format in ('-1', 'all'):
471 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
473 # Specific formats. We pick the first in a slash-delimeted sequence.
474 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
475 req_formats = req_format.split('/')
476 video_url_list = None
477 for rf in req_formats:
479 video_url_list = [(rf, url_map[rf])]
481 if video_url_list is None:
482 self._downloader.trouble(u'ERROR: requested format not available')
485 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format (results list initialisation is
# on an elided line, orig. ~488).
489 for format_param, video_real_url in video_url_list:
491 video_extension = self._video_extensions.get(format_param, 'flv')
493 video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
494 self._video_dimensions.get(format_param, '???'))
497 'id': video_id.decode('utf-8'),
498 'url': video_real_url.decode('utf-8'),
499 'uploader': video_uploader.decode('utf-8'),
500 'upload_date': upload_date,
501 'title': video_title,
502 'ext': video_extension.decode('utf-8'),
503 'format': video_format,
504 'thumbnail': video_thumbnail.decode('utf-8'),
505 'description': video_description,
506 'player_url': player_url,
507 'subtitles': video_subtitles,
508 'duration': video_duration
513 class MetacafeIE(InfoExtractor):
514 """Information Extractor for metacafe.com."""
516 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
517 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
518 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
519 IE_NAME = u'metacafe'
521 def __init__(self, downloader=None):
522 InfoExtractor.__init__(self, downloader)
524 def report_disclaimer(self):
525 """Report disclaimer retrieval."""
526 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
528 def report_age_confirmation(self):
529 """Report attempt to confirm age."""
530 self._downloader.to_screen(u'[metacafe] Confirming age')
532 def report_download_webpage(self, video_id):
533 """Report webpage download."""
534 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
536 def report_extraction(self, video_id):
537 """Report information extraction."""
538 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the age confirmation so
# filtered videos become reachable. NOTE(review): this dump elides interior
# lines (`try:` openers, the disclaimer_form literal opener, orig. 543/548-552),
# and uses Python 2 `except (...), err:` syntax.
540 def _real_initialize(self):
541 # Retrieve disclaimer
542 request = urllib2.Request(self._DISCLAIMER)
544 self.report_disclaimer()
545 disclaimer = urllib2.urlopen(request).read()
546 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
547 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % u(err))
553 'submit': "Continue - I'm over 18",
555 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
557 self.report_age_confirmation()
558 disclaimer = urllib2.urlopen(request).read()
559 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
560 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
563 def _real_extract(self, url):
564 # Extract id and simplified title from URL
565 mobj = re.match(self._VALID_URL, url)
567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
570 video_id = mobj.group(1)
572 # Check if video comes from YouTube
573 mobj2 = re.match(r'^yt-(.*)$', video_id)
574 if mobj2 is not None:
# Metacafe "yt-" IDs are YouTube mirrors: delegate to the downloader,
# which will dispatch the rewritten URL to YoutubeIE.
575 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
578 # Retrieve video webpage to extract further information
579 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
581 self.report_download_webpage(video_id)
582 webpage = urllib2.urlopen(request).read()
583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
584 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
587 # Extract URL, uploader and title from webpage
588 self.report_extraction(video_id)
589 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
591 mediaURL = urllib.unquote(mobj.group(1))
592 video_extension = mediaURL[-3:]
594 # Extract gdaKey if available
595 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
599 gdaKey = mobj.group(1)
600 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: no &mediaURL= in the page, so parse the flashvars blob.
602 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 vardict = parse_qs(mobj.group(1))
607 if 'mediaData' not in vardict:
608 self._downloader.trouble(u'ERROR: unable to extract media URL')
610 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
612 self._downloader.trouble(u'ERROR: unable to extract media URL')
614 mediaURL = mobj.group(1).replace('\\/', '/')
615 video_extension = mediaURL[-3:]
616 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
618 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
620 self._downloader.trouble(u'ERROR: unable to extract title')
622 video_title = mobj.group(1).decode('utf-8')
624 mobj = re.search(r'submitter=(.*?);', webpage)
626 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
628 video_uploader = mobj.group(1)
# Result dict (list-literal opener and closer on elided lines, orig. 630/637+).
631 'id': video_id.decode('utf-8'),
632 'url': video_url.decode('utf-8'),
633 'uploader': video_uploader.decode('utf-8'),
635 'title': video_title,
636 'ext': video_extension.decode('utf-8'),
640 class DailymotionIE(InfoExtractor):
641 """Information Extractor for Dailymotion"""
643 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
644 IE_NAME = u'dailymotion'
646 def __init__(self, downloader=None):
647 InfoExtractor.__init__(self, downloader)
649 def report_download_webpage(self, video_id):
650 """Report webpage download."""
651 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
653 def report_extraction(self, video_id):
654 """Report information extraction."""
655 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# NOTE(review): interior lines are elided from this dump (the `if mobj is
# None:` guards before each trouble() call, `try:` openers, the max_quality
# selection loop body, and the final results list) -- Python 2 syntax.
657 def _real_extract(self, url):
658 # Extract id and simplified title from URL
659 mobj = re.match(self._VALID_URL, url)
661 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip title suffix and query string: /video/x1abc_some-title?foo -> x1abc
664 video_id = mobj.group(1).split('_')[0].split('?')[0]
666 video_extension = 'mp4'
668 # Retrieve video webpage to extract further information
669 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages still render.
670 request.add_header('Cookie', 'family_filter=off')
672 self.report_download_webpage(video_id)
673 webpage = urllib2.urlopen(request).read()
674 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
675 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
678 # Extract URL, uploader and title from webpage
679 self.report_extraction(video_id)
680 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
682 self._downloader.trouble(u'ERROR: unable to extract media URL')
684 flashvars = urllib.unquote(mobj.group(1))
# Scan qualities from best to worst; the first key present in flashvars is
# chosen (assignment to max_quality is presumably on an elided line).
686 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
689 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
692 self._downloader.trouble(u'ERROR: unable to extract video URL')
695 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
697 self._downloader.trouble(u'ERROR: unable to extract video URL')
700 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
702 # TODO: support choosing qualities
704 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
706 self._downloader.trouble(u'ERROR: unable to extract title')
708 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
710 video_uploader = None
711 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
713 # lookin for official user
714 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
715 if mobj_official is None:
716 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
718 video_uploader = mobj_official.group(1)
720 video_uploader = mobj.group(1)
722 video_upload_date = None
# Page shows DD-MM-YYYY; reorder the groups into the YYYYMMDD convention.
723 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
725 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
728 'id': video_id.decode('utf-8'),
729 'url': video_url.decode('utf-8'),
730 'uploader': video_uploader.decode('utf-8'),
731 'upload_date': video_upload_date,
732 'title': video_title,
733 'ext': video_extension.decode('utf-8'),
737 class GoogleIE(InfoExtractor):
738 """Information extractor for video.google.com."""
740 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
741 IE_NAME = u'video.google'
743 def __init__(self, downloader=None):
744 InfoExtractor.__init__(self, downloader)
746 def report_download_webpage(self, video_id):
747 """Report webpage download."""
748 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
750 def report_extraction(self, video_id):
751 """Report information extraction."""
752 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# NOTE(review): interior lines elided from this dump (guards, `try:` openers,
# mediaURL handling between orig. 785 and 789, the results list) -- Py2 syntax.
754 def _real_extract(self, url):
755 # Extract id from URL
756 mobj = re.match(self._VALID_URL, url)
758 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
761 video_id = mobj.group(1)
763 video_extension = 'mp4'
765 # Retrieve video webpage to extract further information
766 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
768 self.report_download_webpage(video_id)
769 webpage = urllib2.urlopen(request).read()
770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
774 # Extract URL, uploader, and title from webpage
775 self.report_extraction(video_id)
# Prefer the direct mp4 download_url; fall back to the escaped flv videoUrl.
776 mobj = re.search(r"download_url:'([^']+)'", webpage)
778 video_extension = 'flv'
779 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
781 self._downloader.trouble(u'ERROR: unable to extract media URL')
783 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JS-style \xNN escaping of '=' and '&' in the URL.
784 mediaURL = mediaURL.replace('\\x3d', '\x3d')
785 mediaURL = mediaURL.replace('\\x26', '\x26')
789 mobj = re.search(r'<title>(.*)</title>', webpage)
791 self._downloader.trouble(u'ERROR: unable to extract title')
793 video_title = mobj.group(1).decode('utf-8')
795 # Extract video description
796 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
798 self._downloader.trouble(u'ERROR: unable to extract video description')
800 video_description = mobj.group(1).decode('utf-8')
801 if not video_description:
802 video_description = 'No description available.'
804 # Extract video thumbnail
805 if self._downloader.params.get('forcethumbnail', False):
# Thumbnail only lives on the search-results page, so fetch that separately.
806 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
808 webpage = urllib2.urlopen(request).read()
809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
812 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
814 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
816 video_thumbnail = mobj.group(1)
817 else: # we need something to pass to process_info
821 'id': video_id.decode('utf-8'),
822 'url': video_url.decode('utf-8'),
825 'title': video_title,
826 'ext': video_extension.decode('utf-8'),
830 class PhotobucketIE(InfoExtractor):
831 """Information extractor for photobucket.com."""
# Only .flv URLs reachable through the ?current= parameter are supported.
833 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
834 IE_NAME = u'photobucket'
836 def __init__(self, downloader=None):
837 InfoExtractor.__init__(self, downloader)
839 def report_download_webpage(self, video_id):
840 """Report webpage download."""
841 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
843 def report_extraction(self, video_id):
844 """Report information extraction."""
845 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# NOTE(review): interior lines elided from this dump (`if mobj is None:`
# guards, `try:` openers, video_url assignment near orig. 874, results list).
847 def _real_extract(self, url):
848 # Extract id from URL
849 mobj = re.match(self._VALID_URL, url)
851 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
854 video_id = mobj.group(1)
856 video_extension = 'flv'
858 # Retrieve video webpage to extract further information
859 request = urllib2.Request(url)
861 self.report_download_webpage(video_id)
862 webpage = urllib2.urlopen(request).read()
863 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
864 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
867 # Extract URL, uploader, and title from webpage
868 self.report_extraction(video_id)
869 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
871 self._downloader.trouble(u'ERROR: unable to extract media URL')
873 mediaURL = urllib.unquote(mobj.group(1))
# Title and uploader come from the same <title> pattern (groups 1 and 2).
877 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
879 self._downloader.trouble(u'ERROR: unable to extract title')
881 video_title = mobj.group(1).decode('utf-8')
883 video_uploader = mobj.group(2).decode('utf-8')
886 'id': video_id.decode('utf-8'),
887 'url': video_url.decode('utf-8'),
888 'uploader': video_uploader,
890 'title': video_title,
891 'ext': video_extension.decode('utf-8'),
895 class YahooIE(InfoExtractor):
896 """Information extractor for video.yahoo.com."""
898 # _VALID_URL matches all Yahoo! Video URLs
899 # _VPAGE_URL matches only the extractable '/watch/' URLs
900 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
901 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
902 IE_NAME = u'video.yahoo'
904 def __init__(self, downloader=None):
905 InfoExtractor.__init__(self, downloader)
907 def report_download_webpage(self, video_id):
908 """Report webpage download."""
909 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
911 def report_extraction(self, video_id):
912 """Report information extraction."""
913 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# NOTE(review): interior lines elided from this dump (guards, `try:` openers,
# the final results list). `new_video` guards the single self-recursion below:
# non-/watch/ URLs are rewritten once, then re-extracted with new_video=False.
915 def _real_extract(self, url, new_video=True):
916 # Extract ID from URL
917 mobj = re.match(self._VALID_URL, url)
919 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
922 video_id = mobj.group(2)
923 video_extension = 'flv'
925 # Rewrite valid but non-extractable URLs as
926 # extractable English language /watch/ URLs
927 if re.match(self._VPAGE_URL, url) is None:
928 request = urllib2.Request(url)
930 webpage = urllib2.urlopen(request).read()
931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
935 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
937 self._downloader.trouble(u'ERROR: Unable to extract id field')
939 yahoo_id = mobj.group(1)
941 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
943 self._downloader.trouble(u'ERROR: Unable to extract vid field')
945 yahoo_vid = mobj.group(1)
947 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
948 return self._real_extract(url, new_video=False)
950 # Retrieve video webpage to extract further information
951 request = urllib2.Request(url)
953 self.report_download_webpage(video_id)
954 webpage = urllib2.urlopen(request).read()
955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
956 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
959 # Extract uploader and title from webpage
960 self.report_extraction(video_id)
961 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
963 self._downloader.trouble(u'ERROR: unable to extract video title')
965 video_title = mobj.group(1).decode('utf-8')
# NOTE(review): group(1) here is the (people|profile) alternation, not the
# uploader name in group(2) -- looks like a latent bug, but confirm against
# the full source before changing.
967 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
969 self._downloader.trouble(u'ERROR: unable to extract video uploader')
971 video_uploader = mobj.group(1).decode('utf-8')
973 # Extract video thumbnail
974 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
976 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
978 video_thumbnail = mobj.group(1).decode('utf-8')
980 # Extract video description
981 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
983 self._downloader.trouble(u'ERROR: unable to extract video description')
985 video_description = mobj.group(1).decode('utf-8')
986 if not video_description:
987 video_description = 'No description available.'
989 # Extract video height and width
990 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
992 self._downloader.trouble(u'ERROR: unable to extract video height')
994 yv_video_height = mobj.group(1)
996 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
998 self._downloader.trouble(u'ERROR: unable to extract video width')
1000 yv_video_width = mobj.group(1)
1002 # Retrieve video playlist to extract media URL
1003 # I'm not completely sure what all these options are, but we
1004 # seem to need most of them, otherwise the server sends a 401.
1005 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1006 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1007 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1008 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1009 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1011 self.report_download_webpage(video_id)
1012 webpage = urllib2.urlopen(request).read()
1013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1014 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1017 # Extract media URL from playlist XML
1018 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1020 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1022 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1023 video_url = unescapeHTML(video_url)
1026 'id': video_id.decode('utf-8'),
1028 'uploader': video_uploader,
1029 'upload_date': None,
1030 'title': video_title,
1031 'ext': video_extension.decode('utf-8'),
1032 'thumbnail': video_thumbnail.decode('utf-8'),
1033 'description': video_description,
1037 class VimeoIE(InfoExtractor):
1038 """Information extractor for vimeo.com."""
1040 # _VALID_URL matches Vimeo URLs
1041 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1044 def __init__(self, downloader=None):
1045 InfoExtractor.__init__(self, downloader)
1047 def report_download_webpage(self, video_id):
1048 """Report webpage download."""
1049 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1051 def report_extraction(self, video_id):
1052 """Report information extraction."""
1053 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1055 def _real_extract(self, url, new_video=True):
1056 # Extract ID from URL
1057 mobj = re.match(self._VALID_URL, url)
1059 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1062 video_id = mobj.group(1)
1064 # Retrieve video webpage to extract further information
1065 request = urllib2.Request(url, None, std_headers)
1067 self.report_download_webpage(video_id)
1068 webpage = urllib2.urlopen(request).read()
1069 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1070 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1073 # Now we begin extracting as much information as we can from what we
1074 # retrieved. First we extract the information common to all extractors,
1075 # and latter we extract those that are Vimeo specific.
1076 self.report_extraction(video_id)
1078 # Extract the config JSON
1079 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1081 config = json.loads(config)
1083 self._downloader.trouble(u'ERROR: unable to extract info section')
1087 video_title = config["video"]["title"]
1090 video_uploader = config["video"]["owner"]["name"]
1092 # Extract video thumbnail
1093 video_thumbnail = config["video"]["thumbnail"]
1095 # Extract video description
1096 video_description = get_element_by_id("description", webpage.decode('utf8'))
1097 if video_description: video_description = clean_html(video_description)
1098 else: video_description = ''
1100 # Extract upload date
1101 video_upload_date = None
1102 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1103 if mobj is not None:
1104 video_upload_date = mobj.group(1)
1106 # Vimeo specific: extract request signature and timestamp
1107 sig = config['request']['signature']
1108 timestamp = config['request']['timestamp']
1110 # Vimeo specific: extract video codec and quality information
1111 # First consider quality, then codecs, then take everything
1112 # TODO bind to format param
1113 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1114 files = { 'hd': [], 'sd': [], 'other': []}
1115 for codec_name, codec_extension in codecs:
1116 if codec_name in config["video"]["files"]:
1117 if 'hd' in config["video"]["files"][codec_name]:
1118 files['hd'].append((codec_name, codec_extension, 'hd'))
1119 elif 'sd' in config["video"]["files"][codec_name]:
1120 files['sd'].append((codec_name, codec_extension, 'sd'))
1122 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1124 for quality in ('hd', 'sd', 'other'):
1125 if len(files[quality]) > 0:
1126 video_quality = files[quality][0][2]
1127 video_codec = files[quality][0][0]
1128 video_extension = files[quality][0][1]
1129 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1132 self._downloader.trouble(u'ERROR: no known codec found')
1135 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1136 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1141 'uploader': video_uploader,
1142 'upload_date': video_upload_date,
1143 'title': video_title,
1144 'ext': video_extension,
1145 'thumbnail': video_thumbnail,
1146 'description': video_description,
1150 class ArteTvIE(InfoExtractor):
1151 """arte.tv information extractor."""
1153 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1154 _LIVE_URL = r'index-[0-9]+\.html$'
1156 IE_NAME = u'arte.tv'
1158 def __init__(self, downloader=None):
1159 InfoExtractor.__init__(self, downloader)
1161 def report_download_webpage(self, video_id):
1162 """Report webpage download."""
1163 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1165 def report_extraction(self, video_id):
1166 """Report information extraction."""
1167 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1169 def fetch_webpage(self, url):
1170 self._downloader.increment_downloads()
1171 request = urllib2.Request(url)
1173 self.report_download_webpage(url)
1174 webpage = urllib2.urlopen(request).read()
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1178 except ValueError, err:
1179 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1184 page = self.fetch_webpage(url)
1185 mobj = re.search(regex, page, regexFlags)
1189 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1192 for (i, key, err) in matchTuples:
1193 if mobj.group(i) is None:
1194 self._downloader.trouble(err)
1197 info[key] = mobj.group(i)
1201 def extractLiveStream(self, url):
1202 video_lang = url.split('/')[-4]
1203 info = self.grep_webpage(
1205 r'src="(.*?/videothek_js.*?\.js)',
1208 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1211 http_host = url.split('/')[2]
1212 next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
1213 info = self.grep_webpage(
1215 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1216 '(http://.*?\.swf).*?' +
1220 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1221 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1222 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1225 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1227 def extractPlus7Stream(self, url):
1228 video_lang = url.split('/')[-3]
1229 info = self.grep_webpage(
1231 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1234 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1237 next_url = urllib.unquote(info.get('url'))
1238 info = self.grep_webpage(
1240 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1243 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1246 next_url = urllib.unquote(info.get('url'))
1248 info = self.grep_webpage(
1250 r'<video id="(.*?)".*?>.*?' +
1251 '<name>(.*?)</name>.*?' +
1252 '<dateVideo>(.*?)</dateVideo>.*?' +
1253 '<url quality="hd">(.*?)</url>',
1256 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1257 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1258 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1259 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1264 'id': info.get('id'),
1265 'url': urllib.unquote(info.get('url')),
1266 'uploader': u'arte.tv',
1267 'upload_date': info.get('date'),
1268 'title': info.get('title'),
1274 def _real_extract(self, url):
1275 video_id = url.split('/')[-1]
1276 self.report_extraction(video_id)
1278 if re.search(self._LIVE_URL, video_id) is not None:
1279 self.extractLiveStream(url)
1282 info = self.extractPlus7Stream(url)
1287 class GenericIE(InfoExtractor):
1288 """Generic last-resort information extractor."""
1291 IE_NAME = u'generic'
1293 def __init__(self, downloader=None):
1294 InfoExtractor.__init__(self, downloader)
1296 def report_download_webpage(self, video_id):
1297 """Report webpage download."""
1298 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1299 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1301 def report_extraction(self, video_id):
1302 """Report information extraction."""
1303 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1305 def report_following_redirect(self, new_url):
1306 """Report information extraction."""
1307 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1309 def _test_redirect(self, url):
1310 """Check if it is a redirect, like url shorteners, in case restart chain."""
1311 class HeadRequest(urllib2.Request):
1312 def get_method(self):
1315 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1317 Subclass the HTTPRedirectHandler to make it use our
1318 HeadRequest also on the redirected URL
1320 def redirect_request(self, req, fp, code, msg, headers, newurl):
1321 if code in (301, 302, 303, 307):
1322 newurl = newurl.replace(' ', '%20')
1323 newheaders = dict((k,v) for k,v in req.headers.items()
1324 if k.lower() not in ("content-length", "content-type"))
1325 return HeadRequest(newurl,
1327 origin_req_host=req.get_origin_req_host(),
1330 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1332 class HTTPMethodFallback(urllib2.BaseHandler):
1334 Fallback to GET if HEAD is not allowed (405 HTTP error)
1336 def http_error_405(self, req, fp, code, msg, headers):
1340 newheaders = dict((k,v) for k,v in req.headers.items()
1341 if k.lower() not in ("content-length", "content-type"))
1342 return self.parent.open(urllib2.Request(req.get_full_url(),
1344 origin_req_host=req.get_origin_req_host(),
1348 opener = urllib2.OpenerDirector()
1349 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1350 HTTPMethodFallback, HEADRedirectHandler,
1351 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1352 opener.add_handler(handler())
1354 response = opener.open(HeadRequest(url))
1355 new_url = response.geturl()
1357 if url == new_url: return False
1359 self.report_following_redirect(new_url)
1360 self._downloader.download([new_url])
1363 def _real_extract(self, url):
1364 if self._test_redirect(url): return
1366 video_id = url.split('/')[-1]
1367 request = urllib2.Request(url)
1369 self.report_download_webpage(video_id)
1370 webpage = urllib2.urlopen(request).read()
1371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1372 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1374 except ValueError, err:
1375 # since this is the last-resort InfoExtractor, if
1376 # this error is thrown, it'll be thrown here
1377 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380 self.report_extraction(video_id)
1381 # Start with something easy: JW Player in SWFObject
1382 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1384 # Broaden the search a little bit
1385 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1387 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1390 # It's possible that one of the regexes
1391 # matched, but returned an empty group:
1392 if mobj.group(1) is None:
1393 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1396 video_url = urllib.unquote(mobj.group(1))
1397 video_id = os.path.basename(video_url)
1399 # here's a fun little line of code for you:
1400 video_extension = os.path.splitext(video_id)[1][1:]
1401 video_id = os.path.splitext(video_id)[0]
1403 # it's tempting to parse this further, but you would
1404 # have to take into account all the variations like
1405 # Video Title - Site Name
1406 # Site Name | Video Title
1407 # Video Title - Tagline | Site Name
1408 # and so on and so forth; it's just not practical
1409 mobj = re.search(r'<title>(.*)</title>', webpage)
1411 self._downloader.trouble(u'ERROR: unable to extract title')
1413 video_title = mobj.group(1).decode('utf-8')
1415 # video uploader is domain name
1416 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1418 self._downloader.trouble(u'ERROR: unable to extract title')
1420 video_uploader = mobj.group(1).decode('utf-8')
1423 'id': video_id.decode('utf-8'),
1424 'url': video_url.decode('utf-8'),
1425 'uploader': video_uploader,
1426 'upload_date': None,
1427 'title': video_title,
1428 'ext': video_extension.decode('utf-8'),
1432 class YoutubeSearchIE(InfoExtractor):
1433 """Information Extractor for YouTube search queries."""
1434 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1435 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1436 _max_youtube_results = 1000
1437 IE_NAME = u'youtube:search'
1439 def __init__(self, downloader=None):
1440 InfoExtractor.__init__(self, downloader)
1442 def report_download_page(self, query, pagenum):
1443 """Report attempt to download search page with given number."""
1444 query = query.decode(preferredencoding())
1445 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1447 def _real_extract(self, query):
1448 mobj = re.match(self._VALID_URL, query)
1450 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1453 prefix, query = query.split(':')
1455 query = query.encode('utf-8')
1457 self._download_n_results(query, 1)
1459 elif prefix == 'all':
1460 self._download_n_results(query, self._max_youtube_results)
1466 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1468 elif n > self._max_youtube_results:
1469 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1470 n = self._max_youtube_results
1471 self._download_n_results(query, n)
1473 except ValueError: # parsing prefix as integer fails
1474 self._download_n_results(query, 1)
1477 def _download_n_results(self, query, n):
1478 """Downloads a specified number of results for a query"""
1484 while (50 * pagenum) < limit:
1485 self.report_download_page(query, pagenum+1)
1486 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1487 request = urllib2.Request(result_url)
1489 data = urllib2.urlopen(request).read()
1490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491 self._downloader.trouble(u'ERROR: unable to download API page: %s' % u(err))
1493 api_response = json.loads(data)['data']
1495 new_ids = list(video['id'] for video in api_response['items'])
1496 video_ids += new_ids
1498 limit = min(n, api_response['totalItems'])
1501 if len(video_ids) > n:
1502 video_ids = video_ids[:n]
1503 for id in video_ids:
1504 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1508 class GoogleSearchIE(InfoExtractor):
1509 """Information Extractor for Google Video search queries."""
1510 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1511 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1512 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1513 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1514 _max_google_results = 1000
1515 IE_NAME = u'video.google:search'
1517 def __init__(self, downloader=None):
1518 InfoExtractor.__init__(self, downloader)
1520 def report_download_page(self, query, pagenum):
1521 """Report attempt to download playlist page with given number."""
1522 query = query.decode(preferredencoding())
1523 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1525 def _real_extract(self, query):
1526 mobj = re.match(self._VALID_URL, query)
1528 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1531 prefix, query = query.split(':')
1533 query = query.encode('utf-8')
1535 self._download_n_results(query, 1)
1537 elif prefix == 'all':
1538 self._download_n_results(query, self._max_google_results)
1544 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1546 elif n > self._max_google_results:
1547 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1548 n = self._max_google_results
1549 self._download_n_results(query, n)
1551 except ValueError: # parsing prefix as integer fails
1552 self._download_n_results(query, 1)
1555 def _download_n_results(self, query, n):
1556 """Downloads a specified number of results for a query"""
1562 self.report_download_page(query, pagenum)
1563 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1564 request = urllib2.Request(result_url)
1566 page = urllib2.urlopen(request).read()
1567 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1568 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1571 # Extract video identifiers
1572 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1573 video_id = mobj.group(1)
1574 if video_id not in video_ids:
1575 video_ids.append(video_id)
1576 if len(video_ids) == n:
1577 # Specified n videos reached
1578 for id in video_ids:
1579 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1582 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1583 for id in video_ids:
1584 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1587 pagenum = pagenum + 1
1590 class YahooSearchIE(InfoExtractor):
1591 """Information Extractor for Yahoo! Video search queries."""
1592 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1593 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1594 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1595 _MORE_PAGES_INDICATOR = r'\s*Next'
1596 _max_yahoo_results = 1000
1597 IE_NAME = u'video.yahoo:search'
1599 def __init__(self, downloader=None):
1600 InfoExtractor.__init__(self, downloader)
1602 def report_download_page(self, query, pagenum):
1603 """Report attempt to download playlist page with given number."""
1604 query = query.decode(preferredencoding())
1605 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1607 def _real_extract(self, query):
1608 mobj = re.match(self._VALID_URL, query)
1610 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1613 prefix, query = query.split(':')
1615 query = query.encode('utf-8')
1617 self._download_n_results(query, 1)
1619 elif prefix == 'all':
1620 self._download_n_results(query, self._max_yahoo_results)
1626 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1628 elif n > self._max_yahoo_results:
1629 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1630 n = self._max_yahoo_results
1631 self._download_n_results(query, n)
1633 except ValueError: # parsing prefix as integer fails
1634 self._download_n_results(query, 1)
1637 def _download_n_results(self, query, n):
1638 """Downloads a specified number of results for a query"""
1641 already_seen = set()
1645 self.report_download_page(query, pagenum)
1646 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1647 request = urllib2.Request(result_url)
1649 page = urllib2.urlopen(request).read()
1650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1654 # Extract video identifiers
1655 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1656 video_id = mobj.group(1)
1657 if video_id not in already_seen:
1658 video_ids.append(video_id)
1659 already_seen.add(video_id)
1660 if len(video_ids) == n:
1661 # Specified n videos reached
1662 for id in video_ids:
1663 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1666 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1667 for id in video_ids:
1668 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1671 pagenum = pagenum + 1
1674 class YoutubePlaylistIE(InfoExtractor):
1675 """Information Extractor for YouTube playlists."""
1677 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1678 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1679 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1680 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1681 IE_NAME = u'youtube:playlist'
1683 def __init__(self, downloader=None):
1684 InfoExtractor.__init__(self, downloader)
1686 def report_download_page(self, playlist_id, pagenum):
1687 """Report attempt to download playlist page with given number."""
1688 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1690 def _real_extract(self, url):
1691 # Extract playlist id
1692 mobj = re.match(self._VALID_URL, url)
1694 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1698 if mobj.group(3) is not None:
1699 self._downloader.download([mobj.group(3)])
1702 # Download playlist pages
1703 # prefix is 'p' as default for playlists but there are other types that need extra care
1704 playlist_prefix = mobj.group(1)
1705 if playlist_prefix == 'a':
1706 playlist_access = 'artist'
1708 playlist_prefix = 'p'
1709 playlist_access = 'view_play_list'
1710 playlist_id = mobj.group(2)
1715 self.report_download_page(playlist_id, pagenum)
1716 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1717 request = urllib2.Request(url)
1719 page = urllib2.urlopen(request).read()
1720 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1721 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1724 # Extract video identifiers
1726 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1727 if mobj.group(1) not in ids_in_page:
1728 ids_in_page.append(mobj.group(1))
1729 video_ids.extend(ids_in_page)
1731 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1733 pagenum = pagenum + 1
1735 playliststart = self._downloader.params.get('playliststart', 1) - 1
1736 playlistend = self._downloader.params.get('playlistend', -1)
1737 if playlistend == -1:
1738 video_ids = video_ids[playliststart:]
1740 video_ids = video_ids[playliststart:playlistend]
1742 for id in video_ids:
1743 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1747 class YoutubeChannelIE(InfoExtractor):
1748 """Information Extractor for YouTube channels."""
1750 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1751 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1752 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1753 IE_NAME = u'youtube:channel'
1755 def report_download_page(self, channel_id, pagenum):
1756 """Report attempt to download channel page with given number."""
1757 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1759 def _real_extract(self, url):
1760 # Extract channel id
1761 mobj = re.match(self._VALID_URL, url)
1763 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1766 # Download channel pages
1767 channel_id = mobj.group(1)
1772 self.report_download_page(channel_id, pagenum)
1773 url = self._TEMPLATE_URL % (channel_id, pagenum)
1774 request = urllib2.Request(url)
1776 page = urllib2.urlopen(request).read()
1777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1778 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1781 # Extract video identifiers
1783 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1784 if mobj.group(1) not in ids_in_page:
1785 ids_in_page.append(mobj.group(1))
1786 video_ids.extend(ids_in_page)
1788 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1790 pagenum = pagenum + 1
1792 for id in video_ids:
1793 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1797 class YoutubeUserIE(InfoExtractor):
1798 """Information Extractor for YouTube users."""
1800 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1801 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1802 _GDATA_PAGE_SIZE = 50
1803 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1804 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1805 IE_NAME = u'youtube:user'
1807 def __init__(self, downloader=None):
1808 InfoExtractor.__init__(self, downloader)
1810 def report_download_page(self, username, start_index):
1811 """Report attempt to download user page."""
1812 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1813 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1815 def _real_extract(self, url):
1817 mobj = re.match(self._VALID_URL, url)
1819 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1822 username = mobj.group(1)
1824 # Download video ids using YouTube Data API. Result size per
1825 # query is limited (currently to 50 videos) so we need to query
1826 # page by page until there are no video ids - it means we got
1833 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1834 self.report_download_page(username, start_index)
1836 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1839 page = urllib2.urlopen(request).read()
1840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1844 # Extract video identifiers
1847 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1848 if mobj.group(1) not in ids_in_page:
1849 ids_in_page.append(mobj.group(1))
1851 video_ids.extend(ids_in_page)
1853 # A little optimization - if current page is not
1854 # "full", ie. does not contain PAGE_SIZE video ids then
1855 # we can assume that this page is the last one - there
1856 # are no more ids on further pages - no need to query
1859 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1864 all_ids_count = len(video_ids)
1865 playliststart = self._downloader.params.get('playliststart', 1) - 1
1866 playlistend = self._downloader.params.get('playlistend', -1)
1868 if playlistend == -1:
1869 video_ids = video_ids[playliststart:]
1871 video_ids = video_ids[playliststart:playlistend]
1873 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1874 (username, all_ids_count, len(video_ids)))
1876 for video_id in video_ids:
1877 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1880 class BlipTVUserIE(InfoExtractor):
1881 """Information Extractor for blip.tv users."""
1883 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1885 IE_NAME = u'blip.tv:user'
1887 def __init__(self, downloader=None):
1888 InfoExtractor.__init__(self, downloader)
1890 def report_download_page(self, username, pagenum):
1891 """Report attempt to download user page."""
1892 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1893 (self.IE_NAME, username, pagenum))
1895 def _real_extract(self, url):
1897 mobj = re.match(self._VALID_URL, url)
1899 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1902 username = mobj.group(1)
1904 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1906 request = urllib2.Request(url)
1909 page = urllib2.urlopen(request).read().decode('utf-8')
1910 mobj = re.search(r'data-users-id="([^"]+)"', page)
1911 page_base = page_base % mobj.group(1)
1912 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1913 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1917 # Download video ids using BlipTV Ajax calls. Result size per
1918 # query is limited (currently to 12 videos) so we need to query
1919 # page by page until there are no video ids - it means we got
1926 self.report_download_page(username, pagenum)
1928 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1931 page = urllib2.urlopen(request).read().decode('utf-8')
1932 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1933 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1936 # Extract video identifiers
1939 for mobj in re.finditer(r'href="/([^"]+)"', page):
1940 if mobj.group(1) not in ids_in_page:
1941 ids_in_page.append(unescapeHTML(mobj.group(1)))
1943 video_ids.extend(ids_in_page)
1945 # A little optimization - if current page is not
1946 # "full", ie. does not contain PAGE_SIZE video ids then
1947 # we can assume that this page is the last one - there
1948 # are no more ids on further pages - no need to query
1951 if len(ids_in_page) < self._PAGE_SIZE:
1956 all_ids_count = len(video_ids)
1957 playliststart = self._downloader.params.get('playliststart', 1) - 1
1958 playlistend = self._downloader.params.get('playlistend', -1)
1960 if playlistend == -1:
1961 video_ids = video_ids[playliststart:]
1963 video_ids = video_ids[playliststart:playlistend]
1965 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1966 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1968 for video_id in video_ids:
1969 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, 'return [{') left byte-identical;
# only comments added.
1972 class DepositFilesIE(InfoExtractor):
1973 """Information extractor for depositfiles.com"""
1975 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1976 IE_NAME = u'DepositFiles'
1978 def __init__(self, downloader=None):
1979 InfoExtractor.__init__(self, downloader)
1981 def report_download_webpage(self, file_id):
1982 """Report webpage download."""
1983 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1985 def report_extraction(self, file_id):
1986 """Report information extraction."""
1987 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1989 def _real_extract(self, url):
# Extract the file id from the last URL path segment, then force the
# English locale so the scrape regexes below match.
1990 file_id = url.split('/')[-1]
1991 # Rebuild url in english locale
1992 url = 'http://depositfiles.com/en/files/' + file_id
1994 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download' button.
1995 free_download_indication = { 'gateway_result' : '1' }
1996 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1998 self.report_download_webpage(file_id)
1999 webpage = urllib2.urlopen(request).read()
2000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode();
# compare str(err) used elsewhere in this file. Confirm against full source.
2001 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % u(err))
2004 # Search for the real file URL
2005 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2006 if (mobj is None) or (mobj.group(1) is None):
2007 # Try to figure out reason of the error.
# The site shows an '<strong>Attention...</strong>' notice when the
# download is restricted (e.g. rate limited); surface it to the user.
2008 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2009 if (mobj is not None) and (mobj.group(1) is not None):
2010 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2011 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2013 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2016 file_url = mobj.group(1)
# File extension taken from the download URL (splitext keeps the dot; [1:] drops it).
2017 file_extension = os.path.splitext(file_url)[1][1:]
2019 # Search for file title
2020 mobj = re.search(r'<b title="(.*?)">', webpage)
2022 self._downloader.trouble(u'ERROR: unable to extract title')
2024 file_title = mobj.group(1).decode('utf-8')
# Result info dictionary (surrounding 'return [{' appears sampled out).
2027 'id': file_id.decode('utf-8'),
2028 'url': file_url.decode('utf-8'),
2030 'upload_date': None,
2031 'title': file_title,
2032 'ext': file_extension.decode('utf-8'),
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, dict bodies) left byte-identical;
# only comments added.
2036 class FacebookIE(InfoExtractor):
2037 """Information Extractor for Facebook"""
2040 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2041 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2042 _NETRC_MACHINE = 'facebook'
# Format identifiers ordered best-first; used for quality selection below.
2043 _available_formats = ['video', 'highqual', 'lowqual']
2044 _video_extensions = {
2049 IE_NAME = u'facebook'
2051 def __init__(self, downloader=None):
2052 InfoExtractor.__init__(self, downloader)
2054 def _reporter(self, message):
2055 """Add header and report message."""
2056 self._downloader.to_screen(u'[facebook] %s' % message)
2058 def report_login(self):
2059 """Report attempt to log in."""
2060 self._reporter(u'Logging in')
2062 def report_video_webpage_download(self, video_id):
2063 """Report attempt to download video webpage."""
2064 self._reporter(u'%s: Downloading video webpage' % video_id)
2066 def report_information_extraction(self, video_id):
2067 """Report attempt to extract video information."""
2068 self._reporter(u'%s: Extracting video information' % video_id)
2070 def _parse_page(self, video_webpage):
2071 """Extract video information from page"""
# Map of info-dict key -> scrape regex; values come from inline JS calls
# in Facebook's video page markup.
2073 data = {'title': r'\("video_title", "(.*?)"\)',
2074 'description': r'<div class="datawrap">(.*?)</div>',
2075 'owner': r'\("video_owner_name", "(.*?)"\)',
2076 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2079 for piece in data.keys():
2080 mobj = re.search(data[piece], video_webpage)
2081 if mobj is not None:
# Values are JS-escaped unicode inside a utf-8 page; unescape then unquote.
2082 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one candidate URL per known format ('video', 'highqual', 'lowqual').
2086 for fmt in self._available_formats:
2087 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2088 if mobj is not None:
2089 # URL is in a Javascript segment inside an escaped Unicode format within
2090 # the generally utf-8 page
2091 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2092 video_info['video_urls'] = video_urls
2096 def _real_initialize(self):
# Log in before extraction: credentials come from --username/--password
# or, failing that, from a 'facebook' entry in ~/.netrc.
2097 if self._downloader is None:
2102 downloader_params = self._downloader.params
2104 # Attempt to use provided username and password or .netrc data
2105 if downloader_params.get('username', None) is not None:
2106 useremail = downloader_params['username']
2107 password = downloader_params['password']
2108 elif downloader_params.get('usenetrc', False):
2110 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2111 if info is not None:
2115 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2116 except (IOError, netrc.NetrcParseError), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2117 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
2120 if useremail is None:
2129 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2132 login_results = urllib2.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2133 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2134 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2136 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
2140 def _real_extract(self, url):
2141 mobj = re.match(self._VALID_URL, url)
2143 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2145 video_id = mobj.group('ID')
# Download the video page (authenticated session from _real_initialize).
2148 self.report_video_webpage_download(video_id)
2149 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2151 page = urllib2.urlopen(request)
2152 video_webpage = page.read()
2153 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2157 # Start extracting information
2158 self.report_information_extraction(video_id)
2160 # Extract information
2161 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; extraction aborts if either is missing.
2164 if 'owner' not in video_info:
2165 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2167 video_uploader = video_info['owner']
2170 if 'title' not in video_info:
2171 self._downloader.trouble(u'ERROR: unable to extract video title')
2173 video_title = video_info['title']
2174 video_title = video_title.decode('utf-8')
# thumbnail is optional -- warn and fall back to an empty string.
2177 if 'thumbnail' not in video_info:
2178 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2179 video_thumbnail = ''
2181 video_thumbnail = video_info['thumbnail']
# upload date: parse the page's RFC-2822 style date into YYYYMMDD.
2185 if 'upload_date' in video_info:
2186 upload_time = video_info['upload_date']
2187 timetuple = email.utils.parsedate_tz(upload_time)
2188 if timetuple is not None:
2190 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2195 video_description = video_info.get('description', 'No description available.')
# Format selection: honor --format / --format-limit against the formats
# actually present in url_map, best-first per _available_formats order.
2197 url_map = video_info['video_urls']
2198 if len(url_map.keys()) > 0:
2199 # Decide which formats to download
2200 req_format = self._downloader.params.get('format', None)
2201 format_limit = self._downloader.params.get('format_limit', None)
2203 if format_limit is not None and format_limit in self._available_formats:
2204 format_list = self._available_formats[self._available_formats.index(format_limit):]
2206 format_list = self._available_formats
2207 existing_formats = [x for x in format_list if x in url_map]
2208 if len(existing_formats) == 0:
2209 self._downloader.trouble(u'ERROR: no known formats available for video')
2211 if req_format is None:
2212 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2213 elif req_format == 'worst':
2214 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2215 elif req_format == '-1':
2216 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2219 if req_format not in url_map:
2220 self._downloader.trouble(u'ERROR: requested format not available')
2222 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
2225 for format_param, video_real_url in video_url_list:
2227 video_extension = self._video_extensions.get(format_param, 'mp4')
2230 'id': video_id.decode('utf-8'),
2231 'url': video_real_url.decode('utf-8'),
2232 'uploader': video_uploader.decode('utf-8'),
2233 'upload_date': upload_date,
2234 'title': video_title,
2235 'ext': video_extension.decode('utf-8'),
2236 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2237 'thumbnail': video_thumbnail.decode('utf-8'),
2238 'description': video_description.decode('utf-8'),
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, 'info = {') left byte-identical;
# only comments added.
2242 class BlipTVIE(InfoExtractor):
2243 """Information extractor for blip.tv"""
2245 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex used to pull the filename extension off the media URL.
2246 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2247 IE_NAME = u'blip.tv'
2249 def report_extraction(self, file_id):
2250 """Report information extraction."""
2251 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2253 def report_direct_download(self, title):
2254 """Report direct (non-JSON) download."""
2255 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2257 def _real_extract(self, url):
2258 mobj = re.match(self._VALID_URL, url)
2260 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for its JSON representation of the page
# (cchar is '?' or '&' depending on whether url already has a query).
2267 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2268 request = urllib2.Request(json_url.encode('utf-8'))
2269 self.report_extraction(mobj.group(1))
2272 urlh = urllib2.urlopen(request)
# If the server answers with the media itself instead of JSON,
# derive title/ext from the URL's basename and download directly.
2273 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2274 basename = url.split('/')[-1]
2275 title,ext = os.path.splitext(basename)
2276 title = title.decode('UTF-8')
2277 ext = ext.replace('.', '')
2278 self.report_direct_download(title)
2283 'upload_date': None,
2288 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2289 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
2291 if info is None: # Regular URL
2293 json_code = urlh.read()
2294 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2295 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % u(err))
# Parse the JSON payload; blip.tv nests the record under 'Post' sometimes.
2299 json_data = json.loads(json_code)
2300 if 'Post' in json_data:
2301 data = json_data['Post']
# datestamp like '08-15-11 10:30AM' -> YYYYMMDD.
2305 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2306 video_url = data['media']['url']
2307 umobj = re.match(self._URL_EXT, video_url)
2309 raise ValueError('Can not determine filename extension')
2310 ext = umobj.group(1)
2313 'id': data['item_id'],
2315 'uploader': data['display_name'],
2316 'upload_date': upload_date,
2317 'title': data['title'],
2319 'format': data['media']['mimeType'],
2320 'thumbnail': data['thumbnailUrl'],
2321 'description': data['description'],
2322 'player_url': data['embedUrl']
2324 except (ValueError,KeyError), err:
2325 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv's CDN serves different content depending on User-Agent;
# impersonate iTunes to get the downloadable media.
2328 std_headers['User-Agent'] = 'iTunes/10.6.1'
2332 class MyVideoIE(InfoExtractor):
2333 """Information Extractor for myvideo.de."""
2335 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2336 IE_NAME = u'myvideo'
2338 def __init__(self, downloader=None):
2339 InfoExtractor.__init__(self, downloader)
2341 def report_download_webpage(self, video_id):
2342 """Report webpage download."""
2343 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2345 def report_extraction(self, video_id):
2346 """Report information extraction."""
2347 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2349 def _real_extract(self,url):
2350 mobj = re.match(self._VALID_URL, url)
2352 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2355 video_id = mobj.group(1)
2358 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2360 self.report_download_webpage(video_id)
2361 webpage = urllib2.urlopen(request).read()
2362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2363 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
2366 self.report_extraction(video_id)
2367 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2370 self._downloader.trouble(u'ERROR: unable to extract media URL')
2372 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2374 mobj = re.search('<title>([^<]+)</title>', webpage)
2376 self._downloader.trouble(u'ERROR: unable to extract title')
2379 video_title = mobj.group(1)
2385 'upload_date': None,
2386 'title': video_title,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, dict bodies) left byte-identical;
# only comments added.
2390 class ComedyCentralIE(InfoExtractor):
2391 """Information extractor for The Daily Show and Colbert Report """
# Matches either a shortname alias (':tds', ':colbertnation', ...) or a
# full-episodes URL on thedailyshow.com / colbertnation.com.
2393 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2394 IE_NAME = u'comedycentral'
# Bitrates offered by the MRSS feed, lowest-quality last selected via turls[-1].
2396 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2398 _video_extensions = {
2406 _video_dimensions = {
2415 def report_extraction(self, episode_id):
2416 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2418 def report_config_download(self, episode_id):
2419 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2421 def report_index_download(self, episode_id):
2422 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2424 def report_player_url(self, episode_id):
2425 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2428 def _print_formats(self, formats):
2429 print('Available formats:')
2431 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2434 def _real_extract(self, url):
2435 mobj = re.match(self._VALID_URL, url)
2437 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname aliases redirect to the show's full-episodes index page.
2440 if mobj.group('shortname'):
2441 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2442 url = u'http://www.thedailyshow.com/full-episodes/'
2444 url = u'http://www.colbertnation.com/full-episodes/'
2445 mobj = re.match(self._VALID_URL, url)
2446 assert mobj is not None
# dlNewest: no specific episode given, so follow the redirect to the latest.
2448 dlNewest = not mobj.group('episode')
2450 epTitle = mobj.group('showname')
2452 epTitle = mobj.group('episode')
2454 req = urllib2.Request(url)
2455 self.report_extraction(epTitle)
2457 htmlHandle = urllib2.urlopen(req)
2458 html = htmlHandle.read()
2459 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2460 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
# Re-match after the HTTP redirect to learn which episode we landed on.
2463 url = htmlHandle.geturl()
2464 mobj = re.match(self._VALID_URL, url)
2466 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2468 if mobj.group('episode') == '':
2469 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2471 epTitle = mobj.group('episode')
# Find the Flash player URL (and the mgid-style media URI inside it).
2473 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2475 if len(mMovieParams) == 0:
2476 # The Colbert Report embeds the information in a without
2477 # a URL prefix; so extract the alternate reference
2478 # and then add the URL prefix manually.
2480 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2481 if len(altMovieParams) == 0:
2482 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2485 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL through its redirects (needed for rtmpdump).
2487 playerUrl_raw = mMovieParams[0][0]
2488 self.report_player_url(epTitle)
2490 urlHandle = urllib2.urlopen(playerUrl_raw)
2491 playerUrl = urlHandle.geturl()
2492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2493 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + u(err))
# Download the MRSS show index listing every act of the episode.
2496 uri = mMovieParams[0][1]
2497 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2498 self.report_index_download(epTitle)
2500 indexXml = urllib2.urlopen(indexUrl).read()
2501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2502 self._downloader.trouble(u'ERROR: unable to download episode index: ' + u(err))
# One <item> per act; each has its own media configuration document.
2507 idoc = xml.etree.ElementTree.fromstring(indexXml)
2508 itemEls = idoc.findall('.//item')
2509 for itemEl in itemEls:
2510 mediaId = itemEl.findall('./guid')[0].text
2511 shortMediaId = mediaId.split(':')[-1]
2512 showId = mediaId.split(':')[-2].replace('.com', '')
2513 officialTitle = itemEl.findall('./title')[0].text
2514 officialDate = itemEl.findall('./pubDate')[0].text
2516 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2517 urllib.urlencode({'uri': mediaId}))
2518 configReq = urllib2.Request(configUrl)
2519 self.report_config_download(epTitle)
2521 configXml = urllib2.urlopen(configReq).read()
2522 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2523 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
# turls collects (bitrate, rtmp-url) pairs from the <rendition> elements.
2526 cdoc = xml.etree.ElementTree.fromstring(configXml)
2528 for rendition in cdoc.findall('.//rendition'):
2529 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2533 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2536 if self._downloader.params.get('listformats', None):
2537 self._print_formats([i[0] for i in turls])
2540 # For now, just pick the highest bitrate
2541 format,video_url = turls[-1]
2543 # Get the format arg from the arg stream
2544 req_format = self._downloader.params.get('format', None)
2546 # Select format if we can find one
2549 format, video_url = f, v
2552 # Patch to download from alternative CDN, which does not
2553 # break on current RTMPDump builds
2554 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2555 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2557 if video_url.startswith(broken_cdn):
2558 video_url = video_url.replace(broken_cdn, better_cdn)
2560 effTitle = showId + u'-' + epTitle
2565 'upload_date': officialDate,
2570 'description': officialTitle,
2571 'player_url': None #playerUrl
2574 results.append(info)
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'info = {') left byte-identical;
# only comments added.
2579 class EscapistIE(InfoExtractor):
2580 """Information extractor for The Escapist """
2582 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2583 IE_NAME = u'escapist'
2585 def report_extraction(self, showName):
2586 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2588 def report_config_download(self, showName):
2589 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2591 def _real_extract(self, url):
2592 mobj = re.match(self._VALID_URL, url)
2594 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2596 showName = mobj.group('showname')
2597 videoId = mobj.group('episode')
2599 self.report_extraction(showName)
2601 webPage = urllib2.urlopen(url)
2602 webPageBytes = webPage.read()
# Decode the page using the charset declared in the Content-Type header,
# falling back to utf-8 when none is declared.
2603 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2604 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2606 self._downloader.trouble(u'ERROR: unable to download webpage: ' + u(err))
# Scrape OpenGraph/meta tags for description, thumbnail and player URL;
# the player URL carries a 'config=' query with the JSON config location.
2609 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2610 description = unescapeHTML(descMatch.group(1))
2611 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2612 imgUrl = unescapeHTML(imgMatch.group(1))
2613 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2614 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2615 configUrlMatch = re.search('config=(.*)$', playerUrl)
2616 configUrl = urllib2.unquote(configUrlMatch.group(1))
2618 self.report_config_download(showName)
2620 configJSON = urllib2.urlopen(configUrl).read()
2621 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2622 self._downloader.trouble(u'ERROR: unable to download configuration: ' + u(err))
2625 # Technically, it's JavaScript, not JSON
# Single quotes -> double quotes so json.loads accepts the JS object literal.
2626 configJSON = configJSON.replace("'", '"')
2629 config = json.loads(configJSON)
2630 except (ValueError,), err:
2631 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + u(err))
# playlist[1] holds the actual episode entry (playlist[0] is presumably an
# intro/ad slot -- TODO confirm against the full source).
2634 playlist = config['playlist']
2635 videoUrl = playlist[1]['url']
2640 'uploader': showName,
2641 'upload_date': None,
2644 'thumbnail': imgUrl,
2645 'description': description,
2646 'player_url': playerUrl,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'info = {') left byte-identical;
# only comments added.
2652 class CollegeHumorIE(InfoExtractor):
2653 """Information extractor for collegehumor.com"""
2655 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2656 IE_NAME = u'collegehumor'
2658 def report_webpage(self, video_id):
2659 """Report webpage download."""
2660 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2662 def report_extraction(self, video_id):
2663 """Report information extraction."""
2664 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2666 def _real_extract(self, url):
2667 mobj = re.match(self._VALID_URL, url)
2669 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2671 video_id = mobj.group('videoid')
2673 self.report_webpage(video_id)
2674 request = urllib2.Request(url)
2676 webpage = urllib2.urlopen(request).read()
2677 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2678 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
# The page embeds an internal numeric video id distinct from the URL id;
# it is the key for the moogaloop metadata XML below.
2681 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2683 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2685 internal_video_id = m.group('internalvideoid')
2689 'internal_id': internal_video_id,
2691 'upload_date': None,
# Fetch the metadata XML and fill the remaining info fields from it.
2694 self.report_extraction(video_id)
2695 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2697 metaXml = urllib2.urlopen(xmlUrl).read()
2698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2699 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
2702 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2704 videoNode = mdoc.findall('./video')[0]
2705 info['description'] = videoNode.findall('./description')[0].text
2706 info['title'] = videoNode.findall('./caption')[0].text
2707 info['url'] = videoNode.findall('./file')[0].text
2708 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension = everything after the last dot of the media URL.
2709 info['ext'] = info['url'].rpartition('.')[2]
2711 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'info = {') left byte-identical;
# only comments added.
2717 class XVideosIE(InfoExtractor):
2718 """Information extractor for xvideos.com"""
2720 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2721 IE_NAME = u'xvideos'
2723 def report_webpage(self, video_id):
2724 """Report webpage download."""
2725 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2727 def report_extraction(self, video_id):
2728 """Report information extraction."""
2729 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2731 def _real_extract(self, url):
2732 mobj = re.match(self._VALID_URL, url)
2734 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2736 video_id = mobj.group(1).decode('utf-8')
2738 self.report_webpage(video_id)
2740 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2742 webpage = urllib2.urlopen(request).read()
2743 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2744 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2747 self.report_extraction(video_id)
# Extract video URL: the flash vars carry a percent-encoded flv_url.
2751 mobj = re.search(r'flv_url=(.+?)&', webpage)
2753 self._downloader.trouble(u'ERROR: unable to extract video url')
2755 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Extract video title from the page <title>, dropping the ' - XVID...' suffix.
2759 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2761 self._downloader.trouble(u'ERROR: unable to extract video title')
2763 video_title = mobj.group(1).decode('utf-8')
2766 # Extract video thumbnail
# group(0): the full matched thumbnail URL is used, not just the filename group.
2767 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2769 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2771 video_thumbnail = mobj.group(0).decode('utf-8')
2777 'upload_date': None,
2778 'title': video_title,
2780 'thumbnail': video_thumbnail,
2781 'description': None,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'return [{') left byte-identical;
# only comments added.
2787 class SoundcloudIE(InfoExtractor):
2788 """Information extractor for soundcloud.com
2789 To access the media, the uid of the song and a stream token
2790 must be extracted from the page source and the script must make
2791 a request to media.soundcloud.com/crossdomain.xml. Then
2792 the media can be grabbed by requesting from an url composed
2793 of the stream token and uid
# Groups: (1) uploader slug, (2) track slug.
2796 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2797 IE_NAME = u'soundcloud'
2799 def __init__(self, downloader=None):
2800 InfoExtractor.__init__(self, downloader)
2802 def report_webpage(self, video_id):
2803 """Report webpage download."""
2804 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2806 def report_extraction(self, video_id):
2807 """Report information extraction."""
2808 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2810 def _real_extract(self, url):
2811 mobj = re.match(self._VALID_URL, url)
2813 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2816 # extract uploader (which is in the url)
2817 uploader = mobj.group(1).decode('utf-8')
2818 # extract simple title (uploader + slug of song title)
2819 slug_title = mobj.group(2).decode('utf-8')
2820 simple_title = uploader + u'-' + slug_title
2822 self.report_webpage('%s/%s' % (uploader, slug_title))
2824 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2826 webpage = urllib2.urlopen(request).read()
2827 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2828 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2831 self.report_extraction('%s/%s' % (uploader, slug_title))
2833 # extract uid and stream token that soundcloud hands out for access
2834 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2836 video_id = mobj.group(1)
2837 stream_token = mobj.group(2)
2839 # extract unsimplified title
2840 mobj = re.search('"title":"(.*?)",', webpage)
2842 title = mobj.group(1).decode('utf-8')
# Fall back to the URL-derived title when the page carries no title JSON.
2844 title = simple_title
2846 # construct media url (with uid/token)
2847 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2848 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; default used when the page has none.
2851 description = u'No description available'
2852 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2854 description = mobj.group(1)
# Upload date: parse "Month DD, YYYY HH:MM" from the pretty-date markup
# into the canonical YYYYMMDD form; failures are only logged to stderr.
2858 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2861 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2862 except Exception, e:
2863 self._downloader.to_stderr(u(e))
2865 # for soundcloud, a request to a cross domain is required for cookies
2866 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2869 'id': video_id.decode('utf-8'),
2871 'uploader': uploader.decode('utf-8'),
2872 'upload_date': upload_date,
2875 'description': description.decode('utf-8')
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, 'return [{') left byte-identical;
# only comments added.
2879 class InfoQIE(InfoExtractor):
2880 """Information extractor for infoq.com"""
2882 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2885 def report_webpage(self, video_id):
2886 """Report webpage download."""
2887 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2889 def report_extraction(self, video_id):
2890 """Report information extraction."""
2891 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2893 def _real_extract(self, url):
2894 mobj = re.match(self._VALID_URL, url)
2896 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2899 self.report_webpage(url)
2901 request = urllib2.Request(url)
2903 webpage = urllib2.urlopen(request).read()
2904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
2905 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2908 self.report_extraction(url)
# Extract video URL: jsclassref holds a base64-encoded rtmpe path fragment.
2912 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2914 self._downloader.trouble(u'ERROR: unable to extract video url')
2916 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Extract title from the inline JS contentTitle assignment.
2920 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2922 self._downloader.trouble(u'ERROR: unable to extract video title')
2924 video_title = mobj.group(1).decode('utf-8')
2926 # Extract description
2927 video_description = u'No description available.'
2928 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2929 if mobj is not None:
2930 video_description = mobj.group(1).decode('utf-8')
# id/ext derived from the media file name at the end of the rtmpe URL.
2932 video_filename = video_url.split('/')[-1]
2933 video_id, extension = video_filename.split('.')
2939 'upload_date': None,
2940 'title': video_title,
2941 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2943 'description': video_description,
# NOTE(review): sampled/garbled excerpt -- embedded original line numbers and
# missing statements (try:, return, else:, 'return [{') left byte-identical;
# only comments added.
2948 class MixcloudIE(InfoExtractor):
2949 """Information extractor for www.mixcloud.com"""
2950 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2951 IE_NAME = u'mixcloud'
2953 def __init__(self, downloader=None):
2954 InfoExtractor.__init__(self, downloader)
2956 def report_download_json(self, file_id):
2957 """Report JSON download."""
2958 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2960 def report_extraction(self, file_id):
2961 """Report information extraction."""
2962 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2964 def get_urls(self, jsonData, fmt, bitrate='best'):
2965 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a bare [urls] list; the
# TypeError fallback handles the bitrate-less shape.
2968 bitrate_list = jsonData[fmt]
2969 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2970 bitrate = max(bitrate_list) # select highest
2972 url_list = jsonData[fmt][bitrate]
2973 except TypeError: # we have no bitrate info.
2974 url_list = jsonData[fmt]
2977 def check_urls(self, url_list):
2978 """Returns 1st active url from list"""
# Probes each candidate URL with a GET; the first that opens wins.
2979 for url in url_list:
2981 urllib2.urlopen(url)
2983 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2988 def _print_formats(self, formats):
2989 print('Available formats:')
2990 for fmt in formats.keys():
2991 for b in formats[fmt]:
2993 ext = formats[fmt][b][0]
2994 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2995 except TypeError: # we have no bitrate info
2996 ext = formats[fmt][0]
2997 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3000 def _real_extract(self, url):
3001 mobj = re.match(self._VALID_URL, url)
3003 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3005 # extract uploader & filename from url
3006 uploader = mobj.group(1).decode('utf-8')
3007 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3009 # construct API request
3010 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3011 # retrieve .json file with links to files
3012 request = urllib2.Request(file_url)
3014 self.report_download_json(file_url)
3015 jsonData = urllib2.urlopen(request).read()
3016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): u(err) is not a builtin -- presumably a mangled str()/unicode().
3017 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % u(err))
3021 json_data = json.loads(jsonData)
3022 player_url = json_data['player_swf_url']
3023 formats = dict(json_data['audio_formats'])
3025 req_format = self._downloader.params.get('format', None)
3028 if self._downloader.params.get('listformats', None):
3029 self._print_formats(formats)
# Format selection: no/-'best' request tries every format until one of its
# URLs responds; otherwise the requested format must exist in the JSON.
3032 if req_format is None or req_format == 'best':
3033 for format_param in formats.keys():
3034 url_list = self.get_urls(formats, format_param)
3036 file_url = self.check_urls(url_list)
3037 if file_url is not None:
3040 if req_format not in formats.keys():
3041 self._downloader.trouble(u'ERROR: format is not available')
3044 url_list = self.get_urls(formats, req_format)
3045 file_url = self.check_urls(url_list)
3046 format_param = req_format
3049 'id': file_id.decode('utf-8'),
3050 'url': file_url.decode('utf-8'),
3051 'uploader': uploader.decode('utf-8'),
3052 'upload_date': None,
3053 'title': json_data['name'],
3054 'ext': file_url.split('.')[-1].decode('utf-8'),
3055 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3056 'thumbnail': json_data['thumbnail_url'],
3057 'description': json_data['description'],
3058 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE handles three URL shapes -- a specific video, a
# course page, and the site root -- recursing through self.extract() for the
# latter two.  NOTE(review): this view of the source is elided; guard
# clauses, `try:` openers, `return`s and dict-literal braces are missing
# between the numbered lines below.
3061 class StanfordOpenClassroomIE(InfoExtractor):
3062 """Information extractor for Stanford's Open ClassRoom"""
# Named groups <course> and <video> select which of the three modes runs.
3064 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3065 IE_NAME = u'stanfordoc'
3067 def report_download_webpage(self, objid):
3068 """Report information extraction."""
3069 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3071 def report_extraction(self, video_id):
3072 """Report information extraction."""
3073 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3075 def _real_extract(self, url):
3076 mobj = re.match(self._VALID_URL, url)
# Invalid-URL guard (the `if mobj is None:` line is elided here).
3078 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Mode 1: both course and video present -> extract a single video.
3081 if mobj.group('course') and mobj.group('video'): # A specific video
3082 course = mobj.group('course')
3083 video = mobj.group('video')
3085 'id': course + '_' + video,
3087 'upload_date': None,
3090 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the media files.
3091 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3092 xmlUrl = baseUrl + video + '.xml'
3094 metaXml = urllib2.urlopen(xmlUrl).read()
3095 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3096 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
3098 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on malformed XML; the matching
# `except` line is elided -- only its error report is visible below.
3100 info['title'] = mdoc.findall('./title')[0].text
3101 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3103 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3105 info['ext'] = info['url'].rpartition('.')[2]
# Mode 2: course page -> collect every VideoPage link and recurse.
3107 elif mobj.group('course'): # A course page
3108 course = mobj.group('course')
3113 'upload_date': None,
3116 self.report_download_webpage(info['id'])
3118 coursepage = urllib2.urlopen(url).read()
3119 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3120 self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
# Best-effort scraping: fall back to the id when no <h1> title exists.
3123 m = re.search('<h1>([^<]+)</h1>', coursepage)
3125 info['title'] = unescapeHTML(m.group(1))
3127 info['title'] = info['id']
3129 m = re.search('<description>([^<]+)</description>', coursepage)
3131 info['description'] = unescapeHTML(m.group(1))
# orderedSet de-duplicates while keeping first-seen order.
3133 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3136 'type': 'reference',
3137 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Resolve each reference recursively and flatten into `results`.
3141 for entry in info['list']:
3142 assert entry['type'] == 'reference'
3143 results += self.extract(entry['url'])
# Mode 3: site root -> enumerate all course pages and recurse likewise.
3148 'id': 'Stanford OpenClassroom',
3151 'upload_date': None,
3154 self.report_download_webpage(info['id'])
3155 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3157 rootpage = urllib2.urlopen(rootURL).read()
3158 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3159 self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
3162 info['title'] = info['id']
3164 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3167 'type': 'reference',
3168 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3173 for entry in info['list']:
3174 assert entry['type'] == 'reference'
3175 results += self.extract(entry['url'])
# MTVIE scrapes <meta> tags (song, performer, mtvn_uri) and a playlist id
# from the video page, fetches a mediaGen XML playlist, and takes the last
# rendition as the highest quality.  NOTE(review): this view is elided --
# `if mobj is None:` guards, `try:` openers, `return`s and the result
# dict's braces are missing between the numbered lines.
3178 class MTVIE(InfoExtractor):
3179 """Information extractor for MTV.com"""
# <proto> is optional so scheme-less URLs are accepted and fixed up below.
3181 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3184 def report_webpage(self, video_id):
3185 """Report information extraction."""
3186 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3188 def report_extraction(self, video_id):
3189 """Report information extraction."""
3190 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3192 def _real_extract(self, url):
3193 mobj = re.match(self._VALID_URL, url)
3195 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs before fetching.
3197 if not mobj.group('proto'):
3198 url = 'http://' + url
3199 video_id = mobj.group('videoid')
3200 self.report_webpage(video_id)
3202 request = urllib2.Request(url)
3204 webpage = urllib2.urlopen(request).read()
3205 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3206 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
# Each of the following <meta> lookups has an elided `if mobj is None:`
# guard in front of its trouble() report.
3209 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3211 self._downloader.trouble(u'ERROR: unable to extract song name')
# Page is served as latin-1; decode before unescaping entities.
3213 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3214 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3216 self._downloader.trouble(u'ERROR: unable to extract performer')
3218 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3219 video_title = performer + ' - ' + song_name
3221 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message is missing its verb ("unable to extract mtvn_uri").
3223 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3225 mtvn_uri = mobj.group(1)
3227 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3229 self._downloader.trouble(u'ERROR: unable to extract content id')
3231 content_id = mobj.group(1)
# The mediaGen endpoint returns an XML list of renditions for this video.
3233 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3234 self.report_extraction(video_id)
3235 request = urllib2.Request(videogen_url)
3237 metadataXml = urllib2.urlopen(request).read()
3238 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3239 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % u(err))
3242 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3243 renditions = mdoc.findall('.//rendition')
3245 # For now, always pick the highest quality.
3246 rendition = renditions[-1]
# Format label: e.g. "mp4-640x360_450"; missing attrs hit the elided
# exception handler whose report is the trouble() call below.
3249 _,_,ext = rendition.attrib['type'].partition('/')
3250 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3251 video_url = rendition.find('./src').text
3253 self._downloader.trouble('Invalid rendition field.')
# Visible fields of the returned info dictionary (braces elided).
3259 'uploader': performer,
3260 'upload_date': None,
3261 'title': video_title,
# YoukuIE: fetches a JSON playlist, de-obfuscates the segment file id with a
# seed-keyed character shuffle, and yields one info dict per flv segment.
# NOTE(review): this view is elided -- some `def` lines, `try:` openers,
# guards and `return`s are missing between the numbered lines.
3269 class YoukuIE(InfoExtractor):
3271 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3274 def __init__(self, downloader=None):
3275 InfoExtractor.__init__(self, downloader)
3277 def report_download_webpage(self, file_id):
3278 """Report webpage download."""
3279 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3281 def report_extraction(self, file_id):
3282 """Report information extraction."""
3283 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp + two random numbers (the enclosing
# `def _gen_sid(self):` line is elided from this view).
3286 nowTime = int(time.time() * 1000)
3287 random1 = random.randint(1000,1998)
3288 random2 = random.randint(1000,9999)
3290 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the charset keyed by `seed` (a linear
# congruential step picks which remaining char to emit next).
3292 def _get_file_ID_mix_string(self, seed):
3294 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3296 for i in range(len(source)):
3297 seed = (seed * 211 + 30031 ) % 65536
# NOTE(review): under Python 2, `seed / 65536` is integer division --
# confirm this matches the intended (fractional) ratio of the original
# algorithm before porting.
3298 index = math.floor(seed / 65536 * len(source) )
3299 mixed.append(source[int(index)])
3300 source.remove(source[int(index)])
3301 #return ''.join(mixed)
# Map the '*'-separated numeric fileId through the mixed alphabet.
3304 def _get_file_id(self, fileId, seed):
3305 mixed = self._get_file_ID_mix_string(seed)
3306 ids = fileId.split('*')
3310 realId.append(mixed[int(ch)])
3311 return ''.join(realId)
3313 def _real_extract(self, url):
3314 mobj = re.match(self._VALID_URL, url)
3316 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3318 video_id = mobj.group('ID')
3320 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3322 request = urllib2.Request(info_url, None, std_headers)
3324 self.report_download_webpage(video_id)
3325 jsondata = urllib2.urlopen(request).read()
# Note: this method uses the modern `as err` except form, unlike the rest
# of the file's comma-form clauses.
3326 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3327 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
3330 self.report_extraction(video_id)
3332 config = json.loads(jsondata)
3334 video_title = config['data'][0]['title']
3335 seed = config['data'][0]['seed']
3337 format = self._downloader.params.get('format', None)
3338 supported_format = config['data'][0]['streamfileids'].keys()
# Format selection: 'best' prefers hd2; 'worst' takes the low end
# (the branch bodies are elided in this view).
3340 if format is None or format == 'best':
3341 if 'hd2' in supported_format:
3346 elif format == 'worst':
3354 fileid = config['data'][0]['streamfileids'][format]
3355 seg_number = len(config['data'][0]['segs'][format])
# Per-segment access keys, one per flv chunk.
3358 for i in xrange(seg_number):
3359 keys.append(config['data'][0]['segs'][format][i]['k'])
3362 #youku only could be viewed from mainland china
3364 self._downloader.trouble(u'ERROR: unable to extract info section')
3368 sid = self._gen_sid()
3369 fileid = self._get_file_id(fileid, seed)
3371 #column 8,9 of fileid represent the segment number
3372 #fileid[7:9] should be changed
3373 for index, key in enumerate(keys):
# Splice the zero-based segment index (as two hex digits) into the id.
3375 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3376 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3379 'id': '%s_part%02d' % (video_id, index),
3380 'url': download_url,
3382 'upload_date': None,
3383 'title': video_title,
# One info dict per segment; the list is (presumably) returned after the
# loop -- the return line is elided here.
3386 files_info.append(info)
# XNXXIE: pulls flv_url, title and thumbnail out of the video page with
# three class-level regexes.  NOTE(review): elided view -- the IE_NAME
# assignment referenced by the report methods, `if ... is None:` guards,
# `try:` openers and `return`s are missing between the numbered lines.
3391 class XNXXIE(InfoExtractor):
3392 """Information extractor for xnxx.com"""
3394 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# flv_url is a query parameter embedded in the page, percent-encoded.
3396 VIDEO_URL_RE = r'flv_url=(.*?)&'
3397 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3398 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3400 def report_webpage(self, video_id):
3401 """Report information extraction"""
3402 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3404 def report_extraction(self, video_id):
3405 """Report information extraction"""
3406 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3408 def _real_extract(self, url):
3409 mobj = re.match(self._VALID_URL, url)
3411 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3413 video_id = mobj.group(1).decode('utf-8')
3415 self.report_webpage(video_id)
3417 # Get webpage content
3419 webpage = urllib2.urlopen(url).read()
3420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): passes `err` directly, unlike the `u(err)` wrapping used
# elsewhere in this file.
3421 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3424 result = re.search(self.VIDEO_URL_RE, webpage)
3426 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url arrives percent-encoded; unquote to a usable URL.
3428 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3430 result = re.search(self.VIDEO_TITLE_RE, webpage)
3432 self._downloader.trouble(u'ERROR: unable to extract video title')
3434 video_title = result.group(1).decode('utf-8')
3436 result = re.search(self.VIDEO_THUMB_RE, webpage)
3438 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3440 video_thumbnail = result.group(1).decode('utf-8')
# Visible fields of the returned info dictionary (braces elided).
3446 'upload_date': None,
3447 'title': video_title,
3449 'thumbnail': video_thumbnail,
3450 'description': None,
# GooglePlusIE: extracts videos embedded in Google+ posts (class body
# continues beyond this view).
3454 class GooglePlusIE(InfoExtractor):
3455 """Information extractor for plus.google.com."""
# Group 1: numeric account id; group 2: post id (used as the video id).
3457 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3458 IE_NAME = u'plus.google'
# Plain pass-through constructor, kept for symmetry with the other
# extractors in this file.
3460 def __init__(self, downloader=None):
# Base class wires the downloader in via set_downloader().
3461 InfoExtractor.__init__(self, downloader)
def report_extract_entry(self, url):
    """Announce that the Google+ post at *url* is being fetched."""
    # url is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Downloading entry: %s' % url.decode('utf-8')
    self._downloader.to_screen(message)
def report_date(self, upload_date):
    """Announce the upload date found for the current entry."""
    message = u'[plus.google] Entry date: %s' % (upload_date,)
    self._downloader.to_screen(message)
def report_uploader(self, uploader):
    """Announce the uploader name found for the current entry."""
    # uploader is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Uploader: %s' % uploader.decode('utf-8')
    self._downloader.to_screen(message)
def report_title(self, video_title):
    """Announce the title found for the current entry."""
    # video_title is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Title: %s' % video_title.decode('utf-8')
    self._downloader.to_screen(message)
def report_extract_vid_page(self, video_page):
    """Announce that the linked video page is being parsed."""
    # video_page is a byte string in this Python 2 codebase; decode for display.
    message = u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8')
    self._downloader.to_screen(message)
3483 def _real_extract(self, url):
3484 # Extract id from URL
3485 mobj = re.match(self._VALID_URL, url)
3487 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3490 post_url = mobj.group(0)
3491 video_id = mobj.group(2)
3493 video_extension = 'flv'
3495 # Step 1, Retrieve post webpage to extract further information
3496 self.report_extract_entry(post_url)
3497 request = urllib2.Request(post_url)
3499 webpage = urllib2.urlopen(request).read()
3500 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3501 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % u(err))
3504 # Extract update date
3506 pattern = 'title="Timestamp">(.*?)</a>'
3507 mobj = re.search(pattern, webpage)
3509 upload_date = mobj.group(1)
3510 # Convert timestring to a format suitable for filename
3511 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3512 upload_date = upload_date.strftime('%Y%m%d')
3513 self.report_date(upload_date)
3517 pattern = r'rel\="author".*?>(.*?)</a>'
3518 mobj = re.search(pattern, webpage)
3520 uploader = mobj.group(1)
3521 self.report_uploader(uploader)
3524 # Get the first line for title
3526 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3527 mobj = re.search(pattern, webpage)
3529 video_title = mobj.group(1)
3530 self.report_title(video_title)
3532 # Step 2, Stimulate clicking the image box to launch video
3533 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3534 mobj = re.search(pattern, webpage)
3536 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3538 video_page = mobj.group(1)
3539 request = urllib2.Request(video_page)
3541 webpage = urllib2.urlopen(request).read()
3542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3543 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
3545 self.report_extract_vid_page(video_page)
3548 # Extract video links on video page
3549 """Extract video links of all sizes"""
3550 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3551 mobj = re.findall(pattern, webpage)
3553 self._downloader.trouble(u'ERROR: unable to extract video links')
3555 # Sort in resolution
3556 links = sorted(mobj)
3558 # Choose the lowest of the sort, i.e. highest resolution
3559 video_url = links[-1]
3560 # Only get the url. The resolution part in the tuple has no use anymore
3561 video_url = video_url[-1]
3562 # Treat escaped \u0026 style hex
3563 video_url = unicode(video_url, "unicode_escape")
3567 'id': video_id.decode('utf-8'),
3569 'uploader': uploader.decode('utf-8'),
3570 'upload_date': upload_date.decode('utf-8'),
3571 'title': video_title.decode('utf-8'),
3572 'ext': video_extension.decode('utf-8'),