git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r"""^
 101                          (
 102                              (?:https?://)?                                       # http(s):// (optional)
 103                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 104                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 105                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 106                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 107                              (?:                                                  # the various things that can precede the ID:
 108                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 109                                  |(?:                                             # or the v= param in all its forms
 110                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 111                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 112                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 113                                      v=
 114                                  )
 115                              )?                                                   # optional -> youtube.com/xxxx is OK
 116                          )?                                                       # all until now is optional -> you can pass the naked ID
 117                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 118                          (?(1).+)?                                                # if we found the ID, everything can follow
 119                          $"""
 120         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 121         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 122         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 123         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 124         _NETRC_MACHINE = 'youtube'
 125         # Listed in order of quality
 126         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 127         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 128         _video_extensions = {
 129                 '13': '3gp',
 130                 '17': 'mp4',
 131                 '18': 'mp4',
 132                 '22': 'mp4',
 133                 '37': 'mp4',
 134                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 135                 '43': 'webm',
 136                 '44': 'webm',
 137                 '45': 'webm',
 138                 '46': 'webm',
 139         }
 140         _video_dimensions = {
 141                 '5': '240x400',
 142                 '6': '???',
 143                 '13': '???',
 144                 '17': '144x176',
 145                 '18': '360x640',
 146                 '22': '720x1280',
 147                 '34': '360x640',
 148                 '35': '480x854',
 149                 '37': '1080x1920',
 150                 '38': '3072x4096',
 151                 '43': '360x640',
 152                 '44': '480x854',
 153                 '45': '720x1280',
 154                 '46': '1080x1920',
 155         }
 156         IE_NAME = u'youtube'
 157
 158         def suitable(self, url):
 159                 """Receives a URL and returns True if suitable for this IE."""
 160                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 161
 162         def report_lang(self):
 163                 """Report attempt to set language."""
 164                 self._downloader.to_screen(u'[youtube] Setting language')
 165
 166         def report_login(self):
 167                 """Report attempt to log in."""
 168                 self._downloader.to_screen(u'[youtube] Logging in')
 169
 170         def report_age_confirmation(self):
 171                 """Report attempt to confirm age."""
 172                 self._downloader.to_screen(u'[youtube] Confirming age')
 173
 174         def report_video_webpage_download(self, video_id):
 175                 """Report attempt to download video webpage."""
 176                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 177
 178         def report_video_info_webpage_download(self, video_id):
 179                 """Report attempt to download video info webpage."""
 180                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 181
 182         def report_video_subtitles_download(self, video_id):
 183                 """Report attempt to download video info webpage."""
 184                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 185
 186         def report_information_extraction(self, video_id):
 187                 """Report attempt to extract video information."""
 188                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 189
 190         def report_unavailable_format(self, video_id, format):
 191                 """Report extracted video URL."""
 192                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 193
 194         def report_rtmp_download(self):
 195                 """Indicate the download will use the RTMP protocol."""
 196                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 197
 198         def _closed_captions_xml_to_srt(self, xml_string):
 199                 srt = ''
 200                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 201                 # TODO parse xml instead of regex
 202                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 203                         if not dur: dur = '4'
 204                         start = float(start)
 205                         end = start + float(dur)
 206                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 207                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 208                         caption = unescapeHTML(caption)
 209                         caption = unescapeHTML(caption) # double cycle, intentional
 210                         srt += str(n+1) + '\n'
 211                         srt += start + ' --> ' + end + '\n'
 212                         srt += caption + '\n\n'
 213                 return srt
 214
 215         def _print_formats(self, formats):
 216                 print('Available formats:')
 217                 for x in formats:
 218                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 219
 220         def _real_initialize(self):
 221                 if self._downloader is None:
 222                         return
 223
 224                 username = None
 225                 password = None
 226                 downloader_params = self._downloader.params
 227
 228                 # Attempt to use provided username and password or .netrc data
 229                 if downloader_params.get('username', None) is not None:
 230                         username = downloader_params['username']
 231                         password = downloader_params['password']
 232                 elif downloader_params.get('usenetrc', False):
 233                         try:
 234                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 235                                 if info is not None:
 236                                         username = info[0]
 237                                         password = info[2]
 238                                 else:
 239                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 240                         except (IOError, netrc.NetrcParseError), err:
 241                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
 242                                 return
 243
 244                 # Set language
 245                 request = urllib2.Request(self._LANG_URL)
 246                 try:
 247                         self.report_lang()
 248                         urllib2.urlopen(request).read()
 249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 250                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % u(err))
 251                         return
 252
 253                 # No authentication to be performed
 254                 if username is None:
 255                         return
 256
 257                 # Log in
 258                 login_form = {
 259                                 'current_form': 'loginForm',
 260                                 'next':         '/',
 261                                 'action_login': 'Log In',
 262                                 'username':     username,
 263                                 'password':     password,
 264                                 }
 265                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 266                 try:
 267                         self.report_login()
 268                         login_results = urllib2.urlopen(request).read()
 269                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 270                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 271                                 return
 272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 273                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
 274                         return
 275
 276                 # Confirm age
 277                 age_form = {
 278                                 'next_url':             '/',
 279                                 'action_confirm':       'Confirm',
 280                                 }
 281                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 282                 try:
 283                         self.report_age_confirmation()
 284                         age_results = urllib2.urlopen(request).read()
 285                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 286                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
 287                         return
 288
 289         def _real_extract(self, url):
 290                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 291                 mobj = re.search(self._NEXT_URL_RE, url)
 292                 if mobj:
 293                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 294
 295                 # Extract video id from URL
 296                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 297                 if mobj is None:
 298                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 299                         return
 300                 video_id = mobj.group(2)
 301
 302                 # Get video webpage
 303                 self.report_video_webpage_download(video_id)
 304                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 305                 try:
 306                         video_webpage = urllib2.urlopen(request).read()
 307                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 308                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
 309                         return
 310
 311                 # Attempt to extract SWF player URL
 312                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 313                 if mobj is not None:
 314                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 315                 else:
 316                         player_url = None
 317
 318                 # Get video info
 319                 self.report_video_info_webpage_download(video_id)
 320                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 321                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 322                                         % (video_id, el_type))
 323                         request = urllib2.Request(video_info_url)
 324                         try:
 325                                 video_info_webpage = urllib2.urlopen(request).read()
 326                                 video_info = parse_qs(video_info_webpage)
 327                                 if 'token' in video_info:
 328                                         break
 329                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 330                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
 331                                 return
 332                 if 'token' not in video_info:
 333                         if 'reason' in video_info:
 334                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 335                         else:
 336                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 337                         return
 338
 339                 # Check for "rental" videos
 340                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 341                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 342                         return
 343
 344                 # Start extracting information
 345                 self.report_information_extraction(video_id)
 346
 347                 # uploader
 348                 if 'author' not in video_info:
 349                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 350                         return
 351                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 352
 353                 # title
 354                 if 'title' not in video_info:
 355                         self._downloader.trouble(u'ERROR: unable to extract video title')
 356                         return
 357                 video_title = urllib.unquote_plus(video_info['title'][0])
 358                 video_title = video_title.decode('utf-8')
 359
 360                 # thumbnail image
 361                 if 'thumbnail_url' not in video_info:
 362                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 363                         video_thumbnail = ''
 364                 else:   # don't panic if we can't find it
 365                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 366
 367                 # upload date
 368                 upload_date = u'NA'
 369                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 370                 if mobj is not None:
 371                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 372                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 373                         for expression in format_expressions:
 374                                 try:
 375                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 376                                 except:
 377                                         pass
 378
 379                 # description
 380                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 381                 if video_description: video_description = clean_html(video_description)
 382                 else: video_description = ''
 383
 384                 # closed captions
 385                 video_subtitles = None
 386                 if self._downloader.params.get('writesubtitles', False):
 387                         try:
 388                                 self.report_video_subtitles_download(video_id)
 389                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 390                                 try:
 391                                         srt_list = urllib2.urlopen(request).read()
 392                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 393                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
 394                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 395                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 396                                 if not srt_lang_list:
 397                                         raise Trouble(u'WARNING: video has no closed captions')
 398                                 if self._downloader.params.get('subtitleslang', False):
 399                                         srt_lang = self._downloader.params.get('subtitleslang')
 400                                 elif 'en' in srt_lang_list:
 401                                         srt_lang = 'en'
 402                                 else:
 403                                         srt_lang = srt_lang_list.keys()[0]
 404                                 if not srt_lang in srt_lang_list:
 405                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 406                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 407                                 try:
 408                                         srt_xml = urllib2.urlopen(request).read()
 409                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 410                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err))
 411                                 if not srt_xml:
 412                                         raise Trouble(u'WARNING: unable to download video subtitles')
 413                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 414                         except Trouble as trouble:
 415                                 self._downloader.trouble(trouble[0])
 416
 417                 if 'length_seconds' not in video_info:
 418                         self._downloader.trouble(u'WARNING: unable to extract video duration')
 419                         video_duration = ''
 420                 else:
 421                         video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
 422
 423                 # token
 424                 video_token = urllib.unquote_plus(video_info['token'][0])
 425
 426                 # Decide which formats to download
 427                 req_format = self._downloader.params.get('format', None)
 428
 429                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 430                         self.report_rtmp_download()
 431                         video_url_list = [(None, video_info['conn'][0])]
 432                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 433                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 434                         url_data = [parse_qs(uds) for uds in url_data_strs]
 435                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 436                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 437
 438                         format_limit = self._downloader.params.get('format_limit', None)
 439                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 440                         if format_limit is not None and format_limit in available_formats:
 441                                 format_list = available_formats[available_formats.index(format_limit):]
 442                         else:
 443                                 format_list = available_formats
 444                         existing_formats = [x for x in format_list if x in url_map]
 445                         if len(existing_formats) == 0:
 446                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 447                                 return
 448                         if self._downloader.params.get('listformats', None):
 449                                 self._print_formats(existing_formats)
 450                                 return
 451                         if req_format is None or req_format == 'best':
 452                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 453                         elif req_format == 'worst':
 454                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 455                         elif req_format in ('-1', 'all'):
 456                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 457                         else:
 458                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 459                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 460                                 req_formats = req_format.split('/')
 461                                 video_url_list = None
 462                                 for rf in req_formats:
 463                                         if rf in url_map:
 464                                                 video_url_list = [(rf, url_map[rf])]
 465                                                 break
 466                                 if video_url_list is None:
 467                                         self._downloader.trouble(u'ERROR: requested format not available')
 468                                         return
 469                 else:
 470                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 471                         return
 472
 473                 results = []
 474                 for format_param, video_real_url in video_url_list:
 475                         # Extension
 476                         video_extension = self._video_extensions.get(format_param, 'flv')
 477
 478                         results.append({
 479                                 'id':           video_id.decode('utf-8'),
 480                                 'url':          video_real_url.decode('utf-8'),
 481                                 'uploader':     video_uploader.decode('utf-8'),
 482                                 'upload_date':  upload_date,
 483                                 'title':        video_title,
 484                                 'ext':          video_extension.decode('utf-8'),
 485                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 486                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 487                                 'description':  video_description,
 488                                 'player_url':   player_url,
 489                                 'subtitles':    video_subtitles,
 490                                 'duration':             video_duration
 491                         })
 492                 return results
 493
 494
 495 class MetacafeIE(InfoExtractor):
 496         """Information Extractor for metacafe.com."""
 497
        # URL pattern for metacafe watch pages; group 1 is read as the video id
        # by _real_extract().
        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        # Page fetched first by _real_initialize() (the family-filter disclaimer).
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        # Endpoint the age-confirmation form is POSTed to by _real_initialize().
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
        IE_NAME = u'metacafe'
 502
 503         def __init__(self, downloader=None):
 504                 InfoExtractor.__init__(self, downloader)
 505
 506         def report_disclaimer(self):
 507                 """Report disclaimer retrieval."""
 508                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 509
 510         def report_age_confirmation(self):
 511                 """Report attempt to confirm age."""
 512                 self._downloader.to_screen(u'[metacafe] Confirming age')
 513
 514         def report_download_webpage(self, video_id):
 515                 """Report webpage download."""
 516                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 517
 518         def report_extraction(self, video_id):
 519                 """Report information extraction."""
 520                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 521
 522         def _real_initialize(self):
 523                 # Retrieve disclaimer
 524                 request = urllib2.Request(self._DISCLAIMER)
 525                 try:
 526                         self.report_disclaimer()
 527                         disclaimer = urllib2.urlopen(request).read()
 528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 529                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % u(err))
 530                         return
 531
 532                 # Confirm age
 533                 disclaimer_form = {
 534                         'filters': '0',
 535                         'submit': "Continue - I'm over 18",
 536                         }
 537                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 538                 try:
 539                         self.report_age_confirmation()
 540                         disclaimer = urllib2.urlopen(request).read()
 541                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 542                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err))
 543                         return
 544
 545         def _real_extract(self, url):
 546                 # Extract id and simplified title from URL
 547                 mobj = re.match(self._VALID_URL, url)
 548                 if mobj is None:
 549                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 550                         return
 551
 552                 video_id = mobj.group(1)
 553
 554                 # Check if video comes from YouTube
 555                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 556                 if mobj2 is not None:
 557                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 558                         return
 559
 560                 # Retrieve video webpage to extract further information
 561                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 562                 try:
 563                         self.report_download_webpage(video_id)
 564                         webpage = urllib2.urlopen(request).read()
 565                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 566                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
 567                         return
 568
 569                 # Extract URL, uploader and title from webpage
 570                 self.report_extraction(video_id)
 571                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 572                 if mobj is not None:
 573                         mediaURL = urllib.unquote(mobj.group(1))
 574                         video_extension = mediaURL[-3:]
 575
 576                         # Extract gdaKey if available
 577                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 578                         if mobj is None:
 579                                 video_url = mediaURL
 580                         else:
 581                                 gdaKey = mobj.group(1)
 582                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 583                 else:
 584                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 585                         if mobj is None:
 586                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 587                                 return
 588                         vardict = parse_qs(mobj.group(1))
 589                         if 'mediaData' not in vardict:
 590                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 591                                 return
 592                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 593                         if mobj is None:
 594                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 595                                 return
 596                         mediaURL = mobj.group(1).replace('\\/', '/')
 597                         video_extension = mediaURL[-3:]
 598                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 599
 600                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 601                 if mobj is None:
 602                         self._downloader.trouble(u'ERROR: unable to extract title')
 603                         return
 604                 video_title = mobj.group(1).decode('utf-8')
 605
 606                 mobj = re.search(r'submitter=(.*?);', webpage)
 607                 if mobj is None:
 608                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 609                         return
 610                 video_uploader = mobj.group(1)
 611
 612                 return [{
 613                         'id':           video_id.decode('utf-8'),
 614                         'url':          video_url.decode('utf-8'),
 615                         'uploader':     video_uploader.decode('utf-8'),
 616                         'upload_date':  u'NA',
 617                         'title':        video_title,
 618                         'ext':          video_extension.decode('utf-8'),
 619                         'format':       u'NA',
 620                         'player_url':   None,
 621                 }]
 622
 623
 624 class DailymotionIE(InfoExtractor):
 625         """Information Extractor for Dailymotion"""
 626
 627         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 628         IE_NAME = u'dailymotion'
 629
 630         def __init__(self, downloader=None):
 631                 InfoExtractor.__init__(self, downloader)
 632
 633         def report_download_webpage(self, video_id):
 634                 """Report webpage download."""
 635                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 636
 637         def report_extraction(self, video_id):
 638                 """Report information extraction."""
 639                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 640
 641         def _real_extract(self, url):
 642                 # Extract id and simplified title from URL
 643                 mobj = re.match(self._VALID_URL, url)
 644                 if mobj is None:
 645                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 646                         return
 647
 648                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 649
 650                 video_extension = 'mp4'
 651
 652                 # Retrieve video webpage to extract further information
 653                 request = urllib2.Request(url)
 654                 request.add_header('Cookie', 'family_filter=off')
 655                 try:
 656                         self.report_download_webpage(video_id)
 657                         webpage = urllib2.urlopen(request).read()
 658                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 659                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err))
 660                         return
 661
 662                 # Extract URL, uploader and title from webpage
 663                 self.report_extraction(video_id)
 664                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 665                 if mobj is None:
 666                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 667                         return
 668                 flashvars = urllib.unquote(mobj.group(1))
 669
 670                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 671                         if key in flashvars:
 672                                 max_quality = key
 673                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 674                                 break
 675                 else:
 676                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 677                         return
 678
 679                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 680                 if mobj is None:
 681                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 682                         return
 683
 684                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
 685
 686                 # TODO: support choosing qualities
 687
 688                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 689                 if mobj is None:
 690                         self._downloader.trouble(u'ERROR: unable to extract title')
 691                         return
 692                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 693
 694                 video_uploader = u'NA'
 695                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 696                 if mobj is None:
 697                         # lookin for official user
 698                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 699                         if mobj_official is None:
 700                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 701                         else:
 702                                 video_uploader = mobj_official.group(1)
 703                 else:
 704                         video_uploader = mobj.group(1)
 705
 706                 video_upload_date = u'NA'
 707                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 708                 if mobj is not None:
 709                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 710
 711                 return [{
 712                         'id':           video_id.decode('utf-8'),
 713                         'url':          video_url.decode('utf-8'),
 714                         'uploader':     video_uploader.decode('utf-8'),
 715                         'upload_date':  video_upload_date,
 716                         'title':        video_title,
 717                         'ext':          video_extension.decode('utf-8'),
 718                         'format':       u'NA',
 719                         'player_url':   None,
 720                 }]
 721
 722
 723 class GoogleIE(InfoExtractor):
 724         """Information extractor for video.google.com."""
 725
 726         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 727         IE_NAME = u'video.google'
 728
 729         def __init__(self, downloader=None):
 730                 InfoExtractor.__init__(self, downloader)
 731
 732         def report_download_webpage(self, video_id):
 733                 """Report webpage download."""
 734                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 735
 736         def report_extraction(self, video_id):
 737                 """Report information extraction."""
 738                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 739
 740         def _real_extract(self, url):
 741                 # Extract id from URL
 742                 mobj = re.match(self._VALID_URL, url)
 743                 if mobj is None:
 744                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 745                         return
 746
 747                 video_id = mobj.group(1)
 748
 749                 video_extension = 'mp4'
 750
 751                 # Retrieve video webpage to extract further information
 752                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 753                 try:
 754                         self.report_download_webpage(video_id)
 755                         webpage = urllib2.urlopen(request).read()
 756                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 757                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
 758                         return
 759
 760                 # Extract URL, uploader, and title from webpage
 761                 self.report_extraction(video_id)
 762                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 763                 if mobj is None:
 764                         video_extension = 'flv'
 765                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 766                 if mobj is None:
 767                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 768                         return
 769                 mediaURL = urllib.unquote(mobj.group(1))
 770                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 771                 mediaURL = mediaURL.replace('\\x26', '\x26')
 772
 773                 video_url = mediaURL
 774
 775                 mobj = re.search(r'<title>(.*)</title>', webpage)
 776                 if mobj is None:
 777                         self._downloader.trouble(u'ERROR: unable to extract title')
 778                         return
 779                 video_title = mobj.group(1).decode('utf-8')
 780
 781                 # Extract video description
 782                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 783                 if mobj is None:
 784                         self._downloader.trouble(u'ERROR: unable to extract video description')
 785                         return
 786                 video_description = mobj.group(1).decode('utf-8')
 787                 if not video_description:
 788                         video_description = 'No description available.'
 789
 790                 # Extract video thumbnail
 791                 if self._downloader.params.get('forcethumbnail', False):
 792                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 793                         try:
 794                                 webpage = urllib2.urlopen(request).read()
 795                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 796                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
 797                                 return
 798                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 799                         if mobj is None:
 800                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 801                                 return
 802                         video_thumbnail = mobj.group(1)
 803                 else:   # we need something to pass to process_info
 804                         video_thumbnail = ''
 805
 806                 return [{
 807                         'id':           video_id.decode('utf-8'),
 808                         'url':          video_url.decode('utf-8'),
 809                         'uploader':     u'NA',
 810                         'upload_date':  u'NA',
 811                         'title':        video_title,
 812                         'ext':          video_extension.decode('utf-8'),
 813                         'format':       u'NA',
 814                         'player_url':   None,
 815                 }]
 816
 817
 818 class PhotobucketIE(InfoExtractor):
 819         """Information extractor for photobucket.com."""
 820
 821         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 822         IE_NAME = u'photobucket'
 823
 824         def __init__(self, downloader=None):
 825                 InfoExtractor.__init__(self, downloader)
 826
 827         def report_download_webpage(self, video_id):
 828                 """Report webpage download."""
 829                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 830
 831         def report_extraction(self, video_id):
 832                 """Report information extraction."""
 833                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 834
 835         def _real_extract(self, url):
 836                 # Extract id from URL
 837                 mobj = re.match(self._VALID_URL, url)
 838                 if mobj is None:
 839                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 840                         return
 841
 842                 video_id = mobj.group(1)
 843
 844                 video_extension = 'flv'
 845
 846                 # Retrieve video webpage to extract further information
 847                 request = urllib2.Request(url)
 848                 try:
 849                         self.report_download_webpage(video_id)
 850                         webpage = urllib2.urlopen(request).read()
 851                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 852                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
 853                         return
 854
 855                 # Extract URL, uploader, and title from webpage
 856                 self.report_extraction(video_id)
 857                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 858                 if mobj is None:
 859                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 860                         return
 861                 mediaURL = urllib.unquote(mobj.group(1))
 862
 863                 video_url = mediaURL
 864
 865                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 866                 if mobj is None:
 867                         self._downloader.trouble(u'ERROR: unable to extract title')
 868                         return
 869                 video_title = mobj.group(1).decode('utf-8')
 870
 871                 video_uploader = mobj.group(2).decode('utf-8')
 872
 873                 return [{
 874                         'id':           video_id.decode('utf-8'),
 875                         'url':          video_url.decode('utf-8'),
 876                         'uploader':     video_uploader,
 877                         'upload_date':  u'NA',
 878                         'title':        video_title,
 879                         'ext':          video_extension.decode('utf-8'),
 880                         'format':       u'NA',
 881                         'player_url':   None,
 882                 }]
 883
 884
 885 class YahooIE(InfoExtractor):
 886         """Information extractor for video.yahoo.com."""
 887
 888         # _VALID_URL matches all Yahoo! Video URLs
 889         # _VPAGE_URL matches only the extractable '/watch/' URLs
 890         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 891         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 892         IE_NAME = u'video.yahoo'
 893
 894         def __init__(self, downloader=None):
 895                 InfoExtractor.__init__(self, downloader)
 896
 897         def report_download_webpage(self, video_id):
 898                 """Report webpage download."""
 899                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 900
 901         def report_extraction(self, video_id):
 902                 """Report information extraction."""
 903                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 904
 905         def _real_extract(self, url, new_video=True):
 906                 # Extract ID from URL
 907                 mobj = re.match(self._VALID_URL, url)
 908                 if mobj is None:
 909                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 910                         return
 911
 912                 video_id = mobj.group(2)
 913                 video_extension = 'flv'
 914
 915                 # Rewrite valid but non-extractable URLs as
 916                 # extractable English language /watch/ URLs
 917                 if re.match(self._VPAGE_URL, url) is None:
 918                         request = urllib2.Request(url)
 919                         try:
 920                                 webpage = urllib2.urlopen(request).read()
 921                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 922                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
 923                                 return
 924
 925                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 926                         if mobj is None:
 927                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 928                                 return
 929                         yahoo_id = mobj.group(1)
 930
 931                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 932                         if mobj is None:
 933                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 934                                 return
 935                         yahoo_vid = mobj.group(1)
 936
 937                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 938                         return self._real_extract(url, new_video=False)
 939
 940                 # Retrieve video webpage to extract further information
 941                 request = urllib2.Request(url)
 942                 try:
 943                         self.report_download_webpage(video_id)
 944                         webpage = urllib2.urlopen(request).read()
 945                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 946                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
 947                         return
 948
 949                 # Extract uploader and title from webpage
 950                 self.report_extraction(video_id)
 951                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 952                 if mobj is None:
 953                         self._downloader.trouble(u'ERROR: unable to extract video title')
 954                         return
 955                 video_title = mobj.group(1).decode('utf-8')
 956
 957                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 958                 if mobj is None:
 959                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 960                         return
 961                 video_uploader = mobj.group(1).decode('utf-8')
 962
 963                 # Extract video thumbnail
 964                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 965                 if mobj is None:
 966                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 967                         return
 968                 video_thumbnail = mobj.group(1).decode('utf-8')
 969
 970                 # Extract video description
 971                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 972                 if mobj is None:
 973                         self._downloader.trouble(u'ERROR: unable to extract video description')
 974                         return
 975                 video_description = mobj.group(1).decode('utf-8')
 976                 if not video_description:
 977                         video_description = 'No description available.'
 978
 979                 # Extract video height and width
 980                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 981                 if mobj is None:
 982                         self._downloader.trouble(u'ERROR: unable to extract video height')
 983                         return
 984                 yv_video_height = mobj.group(1)
 985
 986                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 987                 if mobj is None:
 988                         self._downloader.trouble(u'ERROR: unable to extract video width')
 989                         return
 990                 yv_video_width = mobj.group(1)
 991
 992                 # Retrieve video playlist to extract media URL
 993                 # I'm not completely sure what all these options are, but we
 994                 # seem to need most of them, otherwise the server sends a 401.
 995                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 996                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 997                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 998                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 999                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000                 try:
1001                         self.report_download_webpage(video_id)
1002                         webpage = urllib2.urlopen(request).read()
1003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1005                         return
1006
1007                 # Extract media URL from playlist XML
1008                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009                 if mobj is None:
1010                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011                         return
1012                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013                 video_url = unescapeHTML(video_url)
1014
1015                 return [{
1016                         'id':           video_id.decode('utf-8'),
1017                         'url':          video_url,
1018                         'uploader':     video_uploader,
1019                         'upload_date':  u'NA',
1020                         'title':        video_title,
1021                         'ext':          video_extension.decode('utf-8'),
1022                         'thumbnail':    video_thumbnail.decode('utf-8'),
1023                         'description':  video_description,
1024                         'thumbnail':    video_thumbnail,
1025                         'player_url':   None,
1026                 }]
1027
1028
class VimeoIE(InfoExtractor):
        """Information extractor for vimeo.com."""

        # _VALID_URL matches Vimeo URLs
        # (the dot after the optional "www"/"player" prefix is escaped so that
        # only a literal '.' is accepted there)
        _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
        IE_NAME = u'vimeo'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

        def _real_extract(self, url, new_video=True):
                """Build the information dictionary for a Vimeo URL.

                Most fields are read from the config JSON embedded in the page.
                Returns a one-element list holding the dictionary, or None after
                reporting an error through the downloader's trouble() method.
                The new_video argument is accepted for compatibility and is not
                used by this method.
                """
                # Extract ID from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                video_id = mobj.group(1)

                # Retrieve video webpage to extract further information
                request = urllib2.Request(url, None, std_headers)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
                        return

                # Now we begin extracting as much information as we can from what we
                # retrieved. First we extract the information common to all extractors,
                # and latter we extract those that are Vimeo specific.
                self.report_extraction(video_id)

                # Extract the config JSON. Locating the markers happens inside the
                # try block so that a page without them is reported as an
                # extraction error instead of raising IndexError.
                try:
                        config = webpage.split(' = {config:')[1].split(',assets:')[0]
                        config = json.loads(config)
                except Exception:
                        self._downloader.trouble(u'ERROR: unable to extract info section')
                        return

                # Extract title
                video_title = config["video"]["title"]

                # Extract uploader
                video_uploader = config["video"]["owner"]["name"]

                # Extract video thumbnail
                video_thumbnail = config["video"]["thumbnail"]

                # Extract video description
                video_description = get_element_by_id("description", webpage.decode('utf8'))
                if video_description:
                        video_description = clean_html(video_description)
                else:
                        video_description = ''

                # Extract upload date
                video_upload_date = u'NA'
                mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
                if mobj is not None:
                        video_upload_date = mobj.group(1)

                # Vimeo specific: extract request signature and timestamp
                sig = config['request']['signature']
                timestamp = config['request']['timestamp']

                # Vimeo specific: extract video codec and quality information
                # First consider quality, then codecs, then take everything
                # TODO bind to format param
                codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
                files = { 'hd': [], 'sd': [], 'other': []}
                available_files = config["video"]["files"]
                for codec_name, codec_extension in codecs:
                        if codec_name in available_files:
                                if 'hd' in available_files[codec_name]:
                                        files['hd'].append((codec_name, codec_extension, 'hd'))
                                elif 'sd' in available_files[codec_name]:
                                        files['sd'].append((codec_name, codec_extension, 'sd'))
                                else:
                                        files['other'].append((codec_name, codec_extension, available_files[codec_name][0]))

                # Take the first entry of the best non-empty quality bucket
                for quality in ('hd', 'sd', 'other'):
                        if len(files[quality]) > 0:
                                video_quality = files[quality][0][2]
                                video_codec = files[quality][0][0]
                                video_extension = files[quality][0][1]
                                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                                break
                else:
                        self._downloader.trouble(u'ERROR: no known codec found')
                        return

                video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                                        %(video_id, sig, timestamp, video_quality, video_codec.upper())

                return [{
                        'id':           video_id,
                        'url':          video_url,
                        'uploader':     video_uploader,
                        'upload_date':  video_upload_date,
                        'title':        video_title,
                        'ext':          video_extension,
                        'thumbnail':    video_thumbnail,
                        'description':  video_description,
                        'player_url':   None,
                }]
1141
1142
1143 class ArteTvIE(InfoExtractor):
1144         """arte.tv information extractor."""
1145
1146         _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1147         _LIVE_URL = r'index-[0-9]+\.html$'
1148
1149         IE_NAME = u'arte.tv'
1150
1151         def __init__(self, downloader=None):
1152                 InfoExtractor.__init__(self, downloader)
1153
1154         def report_download_webpage(self, video_id):
1155                 """Report webpage download."""
1156                 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1157
1158         def report_extraction(self, video_id):
1159                 """Report information extraction."""
1160                 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1161
1162         def fetch_webpage(self, url):
1163                 self._downloader.increment_downloads()
1164                 request = urllib2.Request(url)
1165                 try:
1166                         self.report_download_webpage(url)
1167                         webpage = urllib2.urlopen(request).read()
1168                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1170                         return
1171                 except ValueError, err:
1172                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1173                         return
1174                 return webpage
1175
1176         def grep_webpage(self, url, regex, regexFlags, matchTuples):
1177                 page = self.fetch_webpage(url)
1178                 mobj = re.search(regex, page, regexFlags)
1179                 info = {}
1180
1181                 if mobj is None:
1182                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183                         return
1184
1185                 for (i, key, err) in matchTuples:
1186                         if mobj.group(i) is None:
1187                                 self._downloader.trouble(err)
1188                                 return
1189                         else:
1190                                 info[key] = mobj.group(i)
1191
1192                 return info
1193
1194         def extractLiveStream(self, url):
1195                 video_lang = url.split('/')[-4]
1196                 info = self.grep_webpage(
1197                         url,
1198                         r'src="(.*?/videothek_js.*?\.js)',
1199                         0,
1200                         [
1201                                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1202                         ]
1203                 )
1204                 http_host = url.split('/')[2]
1205                 next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url')))
1206                 info = self.grep_webpage(
1207                         next_url,
1208                         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1209                                 '(http://.*?\.swf).*?' +
1210                                 '(rtmp://.*?)\'',
1211                         re.DOTALL,
1212                         [
1213                                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1214                                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1215                                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1216                         ]
1217                 )
1218                 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1219
1220         def extractPlus7Stream(self, url):
1221                 video_lang = url.split('/')[-3]
1222                 info = self.grep_webpage(
1223                         url,
1224                         r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1225                         0,
1226                         [
1227                                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1228                         ]
1229                 )
1230                 next_url = urllib.unquote(info.get('url'))
1231                 info = self.grep_webpage(
1232                         next_url,
1233                         r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1234                         0,
1235                         [
1236                                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1237                         ]
1238                 )
1239                 next_url = urllib.unquote(info.get('url'))
1240
1241                 info = self.grep_webpage(
1242                         next_url,
1243                         r'<video id="(.*?)".*?>.*?' +
1244                                 '<name>(.*?)</name>.*?' +
1245                                 '<dateVideo>(.*?)</dateVideo>.*?' +
1246                                 '<url quality="hd">(.*?)</url>',
1247                         re.DOTALL,
1248                         [
1249                                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1250                                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1251                                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1252                                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1253                         ]
1254                 )
1255
1256                 return {
1257                         'id':           info.get('id'),
1258                         'url':          urllib.unquote(info.get('url')),
1259                         'uploader':     u'arte.tv',
1260                         'upload_date':  info.get('date'),
1261                         'title':        info.get('title'),
1262                         'ext':          u'mp4',
1263                         'format':       u'NA',
1264                         'player_url':   None,
1265                 }
1266
1267         def _real_extract(self, url):
1268                 video_id = url.split('/')[-1]
1269                 self.report_extraction(video_id)
1270
1271                 if re.search(self._LIVE_URL, video_id) is not None:
1272                         self.extractLiveStream(url)
1273                         return
1274                 else:
1275                         info = self.extractPlus7Stream(url)
1276
1277                 return [info]
1278
1279
1280 class GenericIE(InfoExtractor):
1281         """Generic last-resort information extractor."""
1282
1283         _VALID_URL = r'.*'
1284         IE_NAME = u'generic'
1285
1286         def __init__(self, downloader=None):
1287                 InfoExtractor.__init__(self, downloader)
1288
1289         def report_download_webpage(self, video_id):
1290                 """Report webpage download."""
1291                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1292                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1293
1294         def report_extraction(self, video_id):
1295                 """Report information extraction."""
1296                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1297
1298         def report_following_redirect(self, new_url):
1299                 """Report information extraction."""
1300                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1301
1302         def _test_redirect(self, url):
1303                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1304                 class HeadRequest(urllib2.Request):
1305                         def get_method(self):
1306                                 return "HEAD"
1307
1308                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1309                         """
1310                         Subclass the HTTPRedirectHandler to make it use our
1311                         HeadRequest also on the redirected URL
1312                         """
1313                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1314                                 if code in (301, 302, 303, 307):
1315                                         newurl = newurl.replace(' ', '%20')
1316                                         newheaders = dict((k,v) for k,v in req.headers.items()
1317                                                                           if k.lower() not in ("content-length", "content-type"))
1318                                         return HeadRequest(newurl,
1319                                                                            headers=newheaders,
1320                                                                            origin_req_host=req.get_origin_req_host(),
1321                                                                            unverifiable=True)
1322                                 else:
1323                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1324
1325                 class HTTPMethodFallback(urllib2.BaseHandler):
1326                         """
1327                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1328                         """
1329                         def http_error_405(self, req, fp, code, msg, headers):
1330                                 fp.read()
1331                                 fp.close()
1332
1333                                 newheaders = dict((k,v) for k,v in req.headers.items()
1334                                                                   if k.lower() not in ("content-length", "content-type"))
1335                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1336                                                                                                  headers=newheaders,
1337                                                                                                  origin_req_host=req.get_origin_req_host(),
1338                                                                                                  unverifiable=True))
1339
1340                 # Build our opener
1341                 opener = urllib2.OpenerDirector()
1342                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1343                                                 HTTPMethodFallback, HEADRedirectHandler,
1344                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1345                         opener.add_handler(handler())
1346
1347                 response = opener.open(HeadRequest(url))
1348                 new_url = response.geturl()
1349
1350                 if url == new_url: return False
1351
1352                 self.report_following_redirect(new_url)
1353                 self._downloader.download([new_url])
1354                 return True
1355
1356         def _real_extract(self, url):
1357                 if self._test_redirect(url): return
1358
1359                 video_id = url.split('/')[-1]
1360                 request = urllib2.Request(url)
1361                 try:
1362                         self.report_download_webpage(video_id)
1363                         webpage = urllib2.urlopen(request).read()
1364                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1365                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
1366                         return
1367                 except ValueError, err:
1368                         # since this is the last-resort InfoExtractor, if
1369                         # this error is thrown, it'll be thrown here
1370                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371                         return
1372
1373                 self.report_extraction(video_id)
1374                 # Start with something easy: JW Player in SWFObject
1375                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1376                 if mobj is None:
1377                         # Broaden the search a little bit
1378                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1379                 if mobj is None:
1380                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1381                         return
1382
1383                 # It's possible that one of the regexes
1384                 # matched, but returned an empty group:
1385                 if mobj.group(1) is None:
1386                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1387                         return
1388
1389                 video_url = urllib.unquote(mobj.group(1))
1390                 video_id = os.path.basename(video_url)
1391
1392                 # here's a fun little line of code for you:
1393                 video_extension = os.path.splitext(video_id)[1][1:]
1394                 video_id = os.path.splitext(video_id)[0]
1395
1396                 # it's tempting to parse this further, but you would
1397                 # have to take into account all the variations like
1398                 #   Video Title - Site Name
1399                 #   Site Name | Video Title
1400                 #   Video Title - Tagline | Site Name
1401                 # and so on and so forth; it's just not practical
1402                 mobj = re.search(r'<title>(.*)</title>', webpage)
1403                 if mobj is None:
1404                         self._downloader.trouble(u'ERROR: unable to extract title')
1405                         return
1406                 video_title = mobj.group(1).decode('utf-8')
1407
1408                 # video uploader is domain name
1409                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1410                 if mobj is None:
1411                         self._downloader.trouble(u'ERROR: unable to extract title')
1412                         return
1413                 video_uploader = mobj.group(1).decode('utf-8')
1414
1415                 return [{
1416                         'id':           video_id.decode('utf-8'),
1417                         'url':          video_url.decode('utf-8'),
1418                         'uploader':     video_uploader,
1419                         'upload_date':  u'NA',
1420                         'title':        video_title,
1421                         'ext':          video_extension.decode('utf-8'),
1422                         'format':       u'NA',
1423                         'player_url':   None,
1424                 }]
1425
1426
1427 class YoutubeSearchIE(InfoExtractor):
1428         """Information Extractor for YouTube search queries."""
1429         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1430         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1431         _max_youtube_results = 1000
1432         IE_NAME = u'youtube:search'
1433
1434         def __init__(self, downloader=None):
1435                 InfoExtractor.__init__(self, downloader)
1436
1437         def report_download_page(self, query, pagenum):
1438                 """Report attempt to download search page with given number."""
1439                 query = query.decode(preferredencoding())
1440                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1441
1442         def _real_extract(self, query):
1443                 mobj = re.match(self._VALID_URL, query)
1444                 if mobj is None:
1445                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1446                         return
1447
1448                 prefix, query = query.split(':')
1449                 prefix = prefix[8:]
1450                 query = query.encode('utf-8')
1451                 if prefix == '':
1452                         self._download_n_results(query, 1)
1453                         return
1454                 elif prefix == 'all':
1455                         self._download_n_results(query, self._max_youtube_results)
1456                         return
1457                 else:
1458                         try:
1459                                 n = int(prefix)
1460                                 if n <= 0:
1461                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462                                         return
1463                                 elif n > self._max_youtube_results:
1464                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1465                                         n = self._max_youtube_results
1466                                 self._download_n_results(query, n)
1467                                 return
1468                         except ValueError: # parsing prefix as integer fails
1469                                 self._download_n_results(query, 1)
1470                                 return
1471
1472         def _download_n_results(self, query, n):
1473                 """Downloads a specified number of results for a query"""
1474
1475                 video_ids = []
1476                 pagenum = 0
1477                 limit = n
1478
1479                 while (50 * pagenum) < limit:
1480                         self.report_download_page(query, pagenum+1)
1481                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1482                         request = urllib2.Request(result_url)
1483                         try:
1484                                 data = urllib2.urlopen(request).read()
1485                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % u(err))
1487                                 return
1488                         api_response = json.loads(data)['data']
1489
1490                         new_ids = list(video['id'] for video in api_response['items'])
1491                         video_ids += new_ids
1492
1493                         limit = min(n, api_response['totalItems'])
1494                         pagenum += 1
1495
1496                 if len(video_ids) > n:
1497                         video_ids = video_ids[:n]
1498                 for id in video_ids:
1499                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1500                 return
1501
1502
1503 class GoogleSearchIE(InfoExtractor):
1504         """Information Extractor for Google Video search queries."""
1505         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1506         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1507         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1508         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1509         _max_google_results = 1000
1510         IE_NAME = u'video.google:search'
1511
1512         def __init__(self, downloader=None):
1513                 InfoExtractor.__init__(self, downloader)
1514
1515         def report_download_page(self, query, pagenum):
1516                 """Report attempt to download playlist page with given number."""
1517                 query = query.decode(preferredencoding())
1518                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1519
1520         def _real_extract(self, query):
1521                 mobj = re.match(self._VALID_URL, query)
1522                 if mobj is None:
1523                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1524                         return
1525
1526                 prefix, query = query.split(':')
1527                 prefix = prefix[8:]
1528                 query = query.encode('utf-8')
1529                 if prefix == '':
1530                         self._download_n_results(query, 1)
1531                         return
1532                 elif prefix == 'all':
1533                         self._download_n_results(query, self._max_google_results)
1534                         return
1535                 else:
1536                         try:
1537                                 n = int(prefix)
1538                                 if n <= 0:
1539                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1540                                         return
1541                                 elif n > self._max_google_results:
1542                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1543                                         n = self._max_google_results
1544                                 self._download_n_results(query, n)
1545                                 return
1546                         except ValueError: # parsing prefix as integer fails
1547                                 self._download_n_results(query, 1)
1548                                 return
1549
1550         def _download_n_results(self, query, n):
1551                 """Downloads a specified number of results for a query"""
1552
1553                 video_ids = []
1554                 pagenum = 0
1555
1556                 while True:
1557                         self.report_download_page(query, pagenum)
1558                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1559                         request = urllib2.Request(result_url)
1560                         try:
1561                                 page = urllib2.urlopen(request).read()
1562                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1563                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1564                                 return
1565
1566                         # Extract video identifiers
1567                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1568                                 video_id = mobj.group(1)
1569                                 if video_id not in video_ids:
1570                                         video_ids.append(video_id)
1571                                         if len(video_ids) == n:
1572                                                 # Specified n videos reached
1573                                                 for id in video_ids:
1574                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1575                                                 return
1576
1577                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1578                                 for id in video_ids:
1579                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1580                                 return
1581
1582                         pagenum = pagenum + 1
1583
1584
1585 class YahooSearchIE(InfoExtractor):
1586         """Information Extractor for Yahoo! Video search queries."""
1587         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1588         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1589         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1590         _MORE_PAGES_INDICATOR = r'\s*Next'
1591         _max_yahoo_results = 1000
1592         IE_NAME = u'video.yahoo:search'
1593
1594         def __init__(self, downloader=None):
1595                 InfoExtractor.__init__(self, downloader)
1596
1597         def report_download_page(self, query, pagenum):
1598                 """Report attempt to download playlist page with given number."""
1599                 query = query.decode(preferredencoding())
1600                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1601
1602         def _real_extract(self, query):
1603                 mobj = re.match(self._VALID_URL, query)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1606                         return
1607
1608                 prefix, query = query.split(':')
1609                 prefix = prefix[8:]
1610                 query = query.encode('utf-8')
1611                 if prefix == '':
1612                         self._download_n_results(query, 1)
1613                         return
1614                 elif prefix == 'all':
1615                         self._download_n_results(query, self._max_yahoo_results)
1616                         return
1617                 else:
1618                         try:
1619                                 n = int(prefix)
1620                                 if n <= 0:
1621                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1622                                         return
1623                                 elif n > self._max_yahoo_results:
1624                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1625                                         n = self._max_yahoo_results
1626                                 self._download_n_results(query, n)
1627                                 return
1628                         except ValueError: # parsing prefix as integer fails
1629                                 self._download_n_results(query, 1)
1630                                 return
1631
1632         def _download_n_results(self, query, n):
1633                 """Downloads a specified number of results for a query"""
1634
1635                 video_ids = []
1636                 already_seen = set()
1637                 pagenum = 1
1638
1639                 while True:
1640                         self.report_download_page(query, pagenum)
1641                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1642                         request = urllib2.Request(result_url)
1643                         try:
1644                                 page = urllib2.urlopen(request).read()
1645                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1647                                 return
1648
1649                         # Extract video identifiers
1650                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1651                                 video_id = mobj.group(1)
1652                                 if video_id not in already_seen:
1653                                         video_ids.append(video_id)
1654                                         already_seen.add(video_id)
1655                                         if len(video_ids) == n:
1656                                                 # Specified n videos reached
1657                                                 for id in video_ids:
1658                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1659                                                 return
1660
1661                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1662                                 for id in video_ids:
1663                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1664                                 return
1665
1666                         pagenum = pagenum + 1
1667
1668
1669 class YoutubePlaylistIE(InfoExtractor):
1670         """Information Extractor for YouTube playlists."""
1671
1672         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1673         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1674         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1675         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1676         IE_NAME = u'youtube:playlist'
1677
1678         def __init__(self, downloader=None):
1679                 InfoExtractor.__init__(self, downloader)
1680
1681         def report_download_page(self, playlist_id, pagenum):
1682                 """Report attempt to download playlist page with given number."""
1683                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1684
1685         def _real_extract(self, url):
1686                 # Extract playlist id
1687                 mobj = re.match(self._VALID_URL, url)
1688                 if mobj is None:
1689                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1690                         return
1691
1692                 # Single video case
1693                 if mobj.group(3) is not None:
1694                         self._downloader.download([mobj.group(3)])
1695                         return
1696
1697                 # Download playlist pages
1698                 # prefix is 'p' as default for playlists but there are other types that need extra care
1699                 playlist_prefix = mobj.group(1)
1700                 if playlist_prefix == 'a':
1701                         playlist_access = 'artist'
1702                 else:
1703                         playlist_prefix = 'p'
1704                         playlist_access = 'view_play_list'
1705                 playlist_id = mobj.group(2)
1706                 video_ids = []
1707                 pagenum = 1
1708
1709                 while True:
1710                         self.report_download_page(playlist_id, pagenum)
1711                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1712                         request = urllib2.Request(url)
1713                         try:
1714                                 page = urllib2.urlopen(request).read()
1715                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1716                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
1717                                 return
1718
1719                         # Extract video identifiers
1720                         ids_in_page = []
1721                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1722                                 if mobj.group(1) not in ids_in_page:
1723                                         ids_in_page.append(mobj.group(1))
1724                         video_ids.extend(ids_in_page)
1725
1726                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1727                                 break
1728                         pagenum = pagenum + 1
1729
1730                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1731                 playlistend = self._downloader.params.get('playlistend', -1)
1732                 if playlistend == -1:
1733                         video_ids = video_ids[playliststart:]
1734                 else:
1735                         video_ids = video_ids[playliststart:playlistend]
1736
1737                 for id in video_ids:
1738                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1739                 return
1740
1741
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the paginated video listing of a channel, gathers the video ids
    linked from each page and passes every one of them to the downloader
    as a regular watch URL.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce that page number pagenum of the channel is being fetched."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Collect the channel's video ids page by page and download them.

        Reports an error through the downloader and returns early when the
        URL does not match _VALID_URL or a listing page cannot be fetched.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = url_match.group(1)
        collected_ids = []
        page_number = 1
        more_pages = True

        while more_pages:
            self.report_download_page(channel_id, page_number)
            page_url = self._TEMPLATE_URL % (channel_id, page_number)
            try:
                page = urllib2.urlopen(urllib2.Request(page_url)).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
                return

            # Ids are de-duplicated within a single page only; first
            # occurrence order is kept.
            page_ids = []
            for watch_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if watch_id not in page_ids:
                    page_ids.append(watch_id)
            collected_ids += page_ids

            # The listing ends on the first page without a "Next" button.
            more_pages = re.search(self._MORE_PAGES_INDICATOR, page) is not None
            page_number += 1

        for watch_id in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % watch_id])
        return
1790
1791
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Queries the user's uploads feed (see _GDATA_URL) page by page, collects
    the video ids found in each response, applies the downloader's
    'playliststart' / 'playlistend' parameters and downloads the result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download one page of the user's uploads feed.

        start_index is the index of the first entry requested. A page asks
        for _GDATA_PAGE_SIZE entries, so the last index requested is
        start_index + _GDATA_PAGE_SIZE - 1 (the previous message printed
        one past it).
        """
        last_index = start_index + self._GDATA_PAGE_SIZE - 1
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, last_index))

    def _real_extract(self, url):
        """Collect all video ids uploaded by the user and download them.

        Reports an error through the downloader and returns early when the
        URL does not match _VALID_URL or a feed page cannot be fetched.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # The first page is requested with start-index=1.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
                return

            # Extract video identifiers, keeping first-occurrence order
            ids_in_page = []

            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                if id_match.group(1) not in ids_in_page:
                    ids_in_page.append(id_match.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # 'playliststart' is 1-based in the parameters; -1 as
        # 'playlistend' means "up to the last video".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1873
1874
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Fetches the user page to find the numeric users id, then pages through
    the mobile episode-list endpoint collecting the linked video paths and
    hands each one to the downloader as a blip.tv URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video paths published by the user and download them.

        Reports an error through the downloader and returns early when the
        URL is invalid, a page cannot be fetched, or the users id cannot be
        found in the user page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # NOTE(review): url is requested verbatim, so the 'bliptvuser:NAME'
        # form accepted by _VALID_URL presumably cannot be fetched here -
        # confirm and rebuild a real URL if so.
        request = urllib2.Request(url)

        try:
            page = urllib2.urlopen(request).read().decode('utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
            return

        # The episode list is keyed by the users id embedded in the page;
        # without it there is nothing to query.
        users_id_match = re.search(r'data-users-id="([^"]+)"', page)
        if users_id_match is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv users id')
            return
        page_base = page_base % users_id_match.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = urllib2.Request(page_base + "&page=" + str(pagenum))

            try:
                page = urllib2.urlopen(request).read().decode('utf-8')
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
                return

            # Extract video identifiers. Unescape before the duplicate
            # check so that the value compared is the value stored.
            ids_in_page = []

            for link_match in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(link_match.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # 'playliststart' is 1-based in the parameters; -1 as
        # 'playlistend' means "up to the last video".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1965
1966
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Requests the English file page with the "free download" form field
    set and reads the real file URL and the file title out of the HTML.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Announce that the file webpage is being downloaded."""
        message = u'[DepositFiles] %s: Downloading webpage' % file_id
        self._downloader.to_screen(message)

    def report_extraction(self, file_id):
        """Announce that information is being extracted."""
        message = u'[DepositFiles] %s: Extracting information' % file_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the file.

        Reports an error through the downloader and returns None when the
        page cannot be retrieved or the download URL / title is not found.
        """
        # The file id is whatever follows the last slash of the URL.
        file_id = url.split('/')[-1]
        # Always query the English page so the patterns below apply.
        url = 'http://depositfiles.com/en/files/' + file_id

        # POSTing gateway_result=1 is the 'Free download' button press.
        post_data = urllib.urlencode({'gateway_result': '1'})
        request = urllib2.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % u(err))
            return

        # Locate the real file URL.
        link_match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if link_match is None:
            # No link: look for the site's own explanation before giving
            # up with a generic message.
            notice_match = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if notice_match is None:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            else:
                restriction_message = re.sub(r'\s+', ' ', notice_match.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            return

        file_url = link_match.group(1)
        _root, dotted_extension = os.path.splitext(file_url)
        file_extension = dotted_extension[1:]

        # Locate the file title.
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = title_match.group(1).decode('utf-8')

        info = {
            'id':           file_id.decode('utf-8'),
            'url':          file_url.decode('utf-8'),
            'uploader':     u'NA',
            'upload_date':  u'NA',
            'title':        file_title,
            'ext':          file_extension.decode('utf-8'),
            'format':       u'NA',
            'player_url':   None,
        }
        return [info]
2031
2032
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials from the downloader parameters or from
    .netrc), downloads the video page and extracts title, owner, thumbnail,
    description and the per-quality video URLs embedded in the page.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Listed from most to least preferred: the first available entry is
    # used as "best", the last one as "worst".
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page

        Returns a dictionary holding whichever of 'title', 'description',
        'owner' and 'thumbnail' were found, plus 'video_urls', a dictionary
        mapping each format found in the page to its URL (possibly empty).
        """
        # General data
        patterns = {
            'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        }
        video_info = {}
        for piece, pattern in patterns.items():
            mobj = re.search(pattern, video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Does nothing without a downloader or without credentials. Login
        problems are reported as warnings on stderr and are not fatal.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err))
            return

    def _real_extract(self, url):
        """Return a list of info dictionaries, one per selected format.

        Reports an error through the downloader and returns None when the
        URL is invalid, the page cannot be downloaded, the uploader or
        title is missing, no video URL is present in the page, or the
        requested format is not available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except Exception:
                    # Best effort: an unformattable date keeps u'NA'.
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if not url_map:
            # Without this guard video_url_list below would never be bound
            # and the result loop would fail with a NameError.
            self._downloader.trouble(u'ERROR: unable to extract video URLs')
            return

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if not existing_formats:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return

        if req_format is None:
            chosen_formats = [existing_formats[0]] # Best quality
        elif req_format == 'worst':
            chosen_formats = [existing_formats[-1]] # worst quality
        elif req_format == '-1':
            chosen_formats = existing_formats # All formats
        else:
            # Specific format
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            chosen_formats = [req_format]
        video_url_list = [(f, url_map[f]) for f in chosen_formats]

        # NOTE(review): the values coming from _parse_page have already
        # been through .decode("unicode_escape"); the .decode('utf-8')
        # calls here and on the title above look like they could raise on
        # non-ASCII text under Python 2 - confirm.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       (u'NA' if format_param is None else format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description.decode('utf-8'),
                'player_url':   None,
            })
        return results
2238
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Requests the JSON description of a blip.tv URL. When the server
    answers with video data instead, the URL is treated as a direct
    download and the already opened handle is passed along.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct video download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Reports an error through the downloader and returns None when the
        URL is invalid, the info page cannot be downloaded or read, or the
        JSON data cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON skin parameters to whatever query already exists.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Nothing but the file name is known here; the remaining
                # mandatory fields get the same placeholders the other
                # extractors use so every result has the same keys.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': u'NA',
                    'upload_date': u'NA',
                    'title': title,
                    'ext': ext,
                    'format': u'NA',
                    'player_url': None,
                    'urlhandle': urlh
                }
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % u(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): std_headers is defined outside this class,
        # presumably the headers shared by later requests - this changes
        # the User-Agent for everything that reads it afterwards; confirm.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
2325
2326
2327 class MyVideoIE(InfoExtractor):
2328         """Information Extractor for myvideo.de."""
2329
2330         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2331         IE_NAME = u'myvideo'
2332
2333         def __init__(self, downloader=None):
2334                 InfoExtractor.__init__(self, downloader)
2335
2336         def report_download_webpage(self, video_id):
2337                 """Report webpage download."""
2338                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2339
2340         def report_extraction(self, video_id):
2341                 """Report information extraction."""
2342                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2343
2344         def _real_extract(self,url):
2345                 mobj = re.match(self._VALID_URL, url)
2346                 if mobj is None:
2347                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2348                         return
2349
2350                 video_id = mobj.group(1)
2351
2352                 # Get video webpage
2353                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2354                 try:
2355                         self.report_download_webpage(video_id)
2356                         webpage = urllib2.urlopen(request).read()
2357                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2358                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
2359                         return
2360
2361                 self.report_extraction(video_id)
2362                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2363                                  webpage)
2364                 if mobj is None:
2365                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2366                         return
2367                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2368
2369                 mobj = re.search('<title>([^<]+)</title>', webpage)
2370                 if mobj is None:
2371                         self._downloader.trouble(u'ERROR: unable to extract title')
2372                         return
2373
2374                 video_title = mobj.group(1)
2375
2376                 return [{
2377                         'id':           video_id,
2378                         'url':          video_url,
2379                         'uploader':     u'NA',
2380                         'upload_date':  u'NA',
2381                         'title':        video_title,
2382                         'ext':          u'flv',
2383                         'format':       u'NA',
2384                         'player_url':   None,
2385                 }]
2386
2387 class ComedyCentralIE(InfoExtractor):
2388         """Information extractor for The Daily Show and Colbert Report """
2389
2390         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2391         IE_NAME = u'comedycentral'
2392
2393         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2394
2395         _video_extensions = {
2396                 '3500': 'mp4',
2397                 '2200': 'mp4',
2398                 '1700': 'mp4',
2399                 '1200': 'mp4',
2400                 '750': 'mp4',
2401                 '400': 'mp4',
2402         }
2403         _video_dimensions = {
2404                 '3500': '1280x720',
2405                 '2200': '960x540',
2406                 '1700': '768x432',
2407                 '1200': '640x360',
2408                 '750': '512x288',
2409                 '400': '384x216',
2410         }
2411
2412         def report_extraction(self, episode_id):
2413                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2414
2415         def report_config_download(self, episode_id):
2416                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2417
2418         def report_index_download(self, episode_id):
2419                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2420
2421         def report_player_url(self, episode_id):
2422                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2423
2424
2425         def _print_formats(self, formats):
2426                 print('Available formats:')
2427                 for x in formats:
2428                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2429
2430
2431         def _real_extract(self, url):
2432                 mobj = re.match(self._VALID_URL, url)
2433                 if mobj is None:
2434                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2435                         return
2436
2437                 if mobj.group('shortname'):
2438                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2439                                 url = u'http://www.thedailyshow.com/full-episodes/'
2440                         else:
2441                                 url = u'http://www.colbertnation.com/full-episodes/'
2442                         mobj = re.match(self._VALID_URL, url)
2443                         assert mobj is not None
2444
2445                 dlNewest = not mobj.group('episode')
2446                 if dlNewest:
2447                         epTitle = mobj.group('showname')
2448                 else:
2449                         epTitle = mobj.group('episode')
2450
2451                 req = urllib2.Request(url)
2452                 self.report_extraction(epTitle)
2453                 try:
2454                         htmlHandle = urllib2.urlopen(req)
2455                         html = htmlHandle.read()
2456                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2457                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
2458                         return
2459                 if dlNewest:
2460                         url = htmlHandle.geturl()
2461                         mobj = re.match(self._VALID_URL, url)
2462                         if mobj is None:
2463                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2464                                 return
2465                         if mobj.group('episode') == '':
2466                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2467                                 return
2468                         epTitle = mobj.group('episode')
2469
2470                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2471
2472                 if len(mMovieParams) == 0:
2473                         # The Colbert Report embeds the information in a without
2474                         # a URL prefix; so extract the alternate reference
2475                         # and then add the URL prefix manually.
2476
2477                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2478                         if len(altMovieParams) == 0:
2479                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2480                                 return
2481                         else:
2482                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2483
2484                 playerUrl_raw = mMovieParams[0][0]
2485                 self.report_player_url(epTitle)
2486                 try:
2487                         urlHandle = urllib2.urlopen(playerUrl_raw)
2488                         playerUrl = urlHandle.geturl()
2489                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2490                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + u(err))
2491                         return
2492
2493                 uri = mMovieParams[0][1]
2494                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2495                 self.report_index_download(epTitle)
2496                 try:
2497                         indexXml = urllib2.urlopen(indexUrl).read()
2498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2499                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + u(err))
2500                         return
2501
2502                 results = []
2503
2504                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2505                 itemEls = idoc.findall('.//item')
2506                 for itemEl in itemEls:
2507                         mediaId = itemEl.findall('./guid')[0].text
2508                         shortMediaId = mediaId.split(':')[-1]
2509                         showId = mediaId.split(':')[-2].replace('.com', '')
2510                         officialTitle = itemEl.findall('./title')[0].text
2511                         officialDate = itemEl.findall('./pubDate')[0].text
2512
2513                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2514                                                 urllib.urlencode({'uri': mediaId}))
2515                         configReq = urllib2.Request(configUrl)
2516                         self.report_config_download(epTitle)
2517                         try:
2518                                 configXml = urllib2.urlopen(configReq).read()
2519                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2520                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % u(err))
2521                                 return
2522
2523                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2524                         turls = []
2525                         for rendition in cdoc.findall('.//rendition'):
2526                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2527                                 turls.append(finfo)
2528
2529                         if len(turls) == 0:
2530                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2531                                 continue
2532
2533                         if self._downloader.params.get('listformats', None):
2534                                 self._print_formats([i[0] for i in turls])
2535                                 return
2536
2537                         # For now, just pick the highest bitrate
2538                         format,video_url = turls[-1]
2539
2540                         # Get the format arg from the arg stream
2541                         req_format = self._downloader.params.get('format', None)
2542
2543                         # Select format if we can find one
2544                         for f,v in turls:
2545                                 if f == req_format:
2546                                         format, video_url = f, v
2547                                         break
2548
2549                         # Patch to download from alternative CDN, which does not
2550                         # break on current RTMPDump builds
2551                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2552                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2553
2554                         if video_url.startswith(broken_cdn):
2555                                 video_url = video_url.replace(broken_cdn, better_cdn)
2556
2557                         effTitle = showId + u'-' + epTitle
2558                         info = {
2559                                 'id': shortMediaId,
2560                                 'url': video_url,
2561                                 'uploader': showId,
2562                                 'upload_date': officialDate,
2563                                 'title': effTitle,
2564                                 'ext': 'mp4',
2565                                 'format': format,
2566                                 'thumbnail': None,
2567                                 'description': officialTitle,
2568                                 'player_url': None #playerUrl
2569                         }
2570
2571                         results.append(info)
2572
2573                 return results
2574
2575
2576 class EscapistIE(InfoExtractor):
2577         """Information extractor for The Escapist """
2578
2579         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2580         IE_NAME = u'escapist'
2581
2582         def report_extraction(self, showName):
2583                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2584
2585         def report_config_download(self, showName):
2586                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2587
2588         def _real_extract(self, url):
2589                 mobj = re.match(self._VALID_URL, url)
2590                 if mobj is None:
2591                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2592                         return
2593                 showName = mobj.group('showname')
2594                 videoId = mobj.group('episode')
2595
2596                 self.report_extraction(showName)
2597                 try:
2598                         webPage = urllib2.urlopen(url)
2599                         webPageBytes = webPage.read()
2600                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2601                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2602                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2603                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + u(err))
2604                         return
2605
2606                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2607                 description = unescapeHTML(descMatch.group(1))
2608                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2609                 imgUrl = unescapeHTML(imgMatch.group(1))
2610                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2611                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2612                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2613                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2614
2615                 self.report_config_download(showName)
2616                 try:
2617                         configJSON = urllib2.urlopen(configUrl).read()
2618                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2619                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + u(err))
2620                         return
2621
2622                 # Technically, it's JavaScript, not JSON
2623                 configJSON = configJSON.replace("'", '"')
2624
2625                 try:
2626                         config = json.loads(configJSON)
2627                 except (ValueError,), err:
2628                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + u(err))
2629                         return
2630
2631                 playlist = config['playlist']
2632                 videoUrl = playlist[1]['url']
2633
2634                 info = {
2635                         'id': videoId,
2636                         'url': videoUrl,
2637                         'uploader': showName,
2638                         'upload_date': None,
2639                         'title': showName,
2640                         'ext': 'flv',
2641                         'format': 'flv',
2642                         'thumbnail': imgUrl,
2643                         'description': description,
2644                         'player_url': playerUrl,
2645                 }
2646
2647                 return [info]
2648
2649
2650 class CollegeHumorIE(InfoExtractor):
2651         """Information extractor for collegehumor.com"""
2652
2653         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2654         IE_NAME = u'collegehumor'
2655
2656         def report_webpage(self, video_id):
2657                 """Report information extraction."""
2658                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2659
2660         def report_extraction(self, video_id):
2661                 """Report information extraction."""
2662                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2663
2664         def _real_extract(self, url):
2665                 mobj = re.match(self._VALID_URL, url)
2666                 if mobj is None:
2667                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2668                         return
2669                 video_id = mobj.group('videoid')
2670
2671                 self.report_webpage(video_id)
2672                 request = urllib2.Request(url)
2673                 try:
2674                         webpage = urllib2.urlopen(request).read()
2675                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2676                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2677                         return
2678
2679                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2680                 if m is None:
2681                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2682                         return
2683                 internal_video_id = m.group('internalvideoid')
2684
2685                 info = {
2686                         'id': video_id,
2687                         'internal_id': internal_video_id,
2688                 }
2689
2690                 self.report_extraction(video_id)
2691                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2692                 try:
2693                         metaXml = urllib2.urlopen(xmlUrl).read()
2694                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2695                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
2696                         return
2697
2698                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2699                 try:
2700                         videoNode = mdoc.findall('./video')[0]
2701                         info['description'] = videoNode.findall('./description')[0].text
2702                         info['title'] = videoNode.findall('./caption')[0].text
2703                         info['url'] = videoNode.findall('./file')[0].text
2704                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2705                         info['ext'] = info['url'].rpartition('.')[2]
2706                         info['format'] = info['ext']
2707                 except IndexError:
2708                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2709                         return
2710
2711                 return [info]
2712
2713
2714 class XVideosIE(InfoExtractor):
2715         """Information extractor for xvideos.com"""
2716
2717         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2718         IE_NAME = u'xvideos'
2719
2720         def report_webpage(self, video_id):
2721                 """Report information extraction."""
2722                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2723
2724         def report_extraction(self, video_id):
2725                 """Report information extraction."""
2726                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2727
2728         def _real_extract(self, url):
2729                 mobj = re.match(self._VALID_URL, url)
2730                 if mobj is None:
2731                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2732                         return
2733                 video_id = mobj.group(1).decode('utf-8')
2734
2735                 self.report_webpage(video_id)
2736
2737                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2738                 try:
2739                         webpage = urllib2.urlopen(request).read()
2740                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2741                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2742                         return
2743
2744                 self.report_extraction(video_id)
2745
2746
2747                 # Extract video URL
2748                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2749                 if mobj is None:
2750                         self._downloader.trouble(u'ERROR: unable to extract video url')
2751                         return
2752                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2753
2754
2755                 # Extract title
2756                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2757                 if mobj is None:
2758                         self._downloader.trouble(u'ERROR: unable to extract video title')
2759                         return
2760                 video_title = mobj.group(1).decode('utf-8')
2761
2762
2763                 # Extract video thumbnail
2764                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2765                 if mobj is None:
2766                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2767                         return
2768                 video_thumbnail = mobj.group(0).decode('utf-8')
2769
2770                 info = {
2771                         'id': video_id,
2772                         'url': video_url,
2773                         'uploader': None,
2774                         'upload_date': None,
2775                         'title': video_title,
2776                         'ext': 'flv',
2777                         'format': 'flv',
2778                         'thumbnail': video_thumbnail,
2779                         'description': None,
2780                         'player_url': None,
2781                 }
2782
2783                 return [info]
2784
2785
2786 class SoundcloudIE(InfoExtractor):
2787         """Information extractor for soundcloud.com
2788            To access the media, the uid of the song and a stream token
2789            must be extracted from the page source and the script must make
2790            a request to media.soundcloud.com/crossdomain.xml. Then
2791            the media can be grabbed by requesting from an url composed
2792            of the stream token and uid
2793          """
2794
2795         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2796         IE_NAME = u'soundcloud'
2797
2798         def __init__(self, downloader=None):
2799                 InfoExtractor.__init__(self, downloader)
2800
2801         def report_webpage(self, video_id):
2802                 """Report information extraction."""
2803                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2804
2805         def report_extraction(self, video_id):
2806                 """Report information extraction."""
2807                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2808
2809         def _real_extract(self, url):
2810                 mobj = re.match(self._VALID_URL, url)
2811                 if mobj is None:
2812                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2813                         return
2814
2815                 # extract uploader (which is in the url)
2816                 uploader = mobj.group(1).decode('utf-8')
2817                 # extract simple title (uploader + slug of song title)
2818                 slug_title =  mobj.group(2).decode('utf-8')
2819                 simple_title = uploader + u'-' + slug_title
2820
2821                 self.report_webpage('%s/%s' % (uploader, slug_title))
2822
2823                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2824                 try:
2825                         webpage = urllib2.urlopen(request).read()
2826                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2827                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2828                         return
2829
2830                 self.report_extraction('%s/%s' % (uploader, slug_title))
2831
2832                 # extract uid and stream token that soundcloud hands out for access
2833                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2834                 if mobj:
2835                         video_id = mobj.group(1)
2836                         stream_token = mobj.group(2)
2837
2838                 # extract unsimplified title
2839                 mobj = re.search('"title":"(.*?)",', webpage)
2840                 if mobj:
2841                         title = mobj.group(1).decode('utf-8')
2842                 else:
2843                         title = simple_title
2844
2845                 # construct media url (with uid/token)
2846                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2847                 mediaURL = mediaURL % (video_id, stream_token)
2848
2849                 # description
2850                 description = u'No description available'
2851                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2852                 if mobj:
2853                         description = mobj.group(1)
2854
2855                 # upload date
2856                 upload_date = None
2857                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2858                 if mobj:
2859                         try:
2860                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2861                         except Exception, e:
2862                                 self._downloader.to_stderr(u(e))
2863
2864                 # for soundcloud, a request to a cross domain is required for cookies
2865                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2866
2867                 return [{
2868                         'id':           video_id.decode('utf-8'),
2869                         'url':          mediaURL,
2870                         'uploader':     uploader.decode('utf-8'),
2871                         'upload_date':  upload_date,
2872                         'title':        title,
2873                         'ext':          u'mp3',
2874                         'format':       u'NA',
2875                         'player_url':   None,
2876                         'description': description.decode('utf-8')
2877                 }]
2878
2879
2880 class InfoQIE(InfoExtractor):
2881         """Information extractor for infoq.com"""
2882
2883         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2884         IE_NAME = u'infoq'
2885
2886         def report_webpage(self, video_id):
2887                 """Report information extraction."""
2888                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2889
2890         def report_extraction(self, video_id):
2891                 """Report information extraction."""
2892                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2893
2894         def _real_extract(self, url):
2895                 mobj = re.match(self._VALID_URL, url)
2896                 if mobj is None:
2897                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2898                         return
2899
2900                 self.report_webpage(url)
2901
2902                 request = urllib2.Request(url)
2903                 try:
2904                         webpage = urllib2.urlopen(request).read()
2905                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2906                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
2907                         return
2908
2909                 self.report_extraction(url)
2910
2911
2912                 # Extract video URL
2913                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2914                 if mobj is None:
2915                         self._downloader.trouble(u'ERROR: unable to extract video url')
2916                         return
2917                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2918
2919
2920                 # Extract title
2921                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2922                 if mobj is None:
2923                         self._downloader.trouble(u'ERROR: unable to extract video title')
2924                         return
2925                 video_title = mobj.group(1).decode('utf-8')
2926
2927                 # Extract description
2928                 video_description = u'No description available.'
2929                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2930                 if mobj is not None:
2931                         video_description = mobj.group(1).decode('utf-8')
2932
2933                 video_filename = video_url.split('/')[-1]
2934                 video_id, extension = video_filename.split('.')
2935
2936                 info = {
2937                         'id': video_id,
2938                         'url': video_url,
2939                         'uploader': None,
2940                         'upload_date': None,
2941                         'title': video_title,
2942                         'ext': extension,
2943                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2944                         'thumbnail': None,
2945                         'description': video_description,
2946                         'player_url': None,
2947                 }
2948
2949                 return [info]
2950
2951 class MixcloudIE(InfoExtractor):
2952         """Information extractor for www.mixcloud.com"""
2953         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2954         IE_NAME = u'mixcloud'
2955
        def __init__(self, downloader=None):
                """Initialize the extractor by delegating to InfoExtractor.__init__."""
                InfoExtractor.__init__(self, downloader)
2958
2959         def report_download_json(self, file_id):
2960                 """Report JSON download."""
2961                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2962
2963         def report_extraction(self, file_id):
2964                 """Report information extraction."""
2965                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2966
2967         def get_urls(self, jsonData, fmt, bitrate='best'):
2968                 """Get urls from 'audio_formats' section in json"""
2969                 file_url = None
2970                 try:
2971                         bitrate_list = jsonData[fmt]
2972                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2973                                 bitrate = max(bitrate_list) # select highest
2974
2975                         url_list = jsonData[fmt][bitrate]
2976                 except TypeError: # we have no bitrate info.
2977                         url_list = jsonData[fmt]
2978                 return url_list
2979
2980         def check_urls(self, url_list):
2981                 """Returns 1st active url from list"""
2982                 for url in url_list:
2983                         try:
2984                                 urllib2.urlopen(url)
2985                                 return url
2986                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2987                                 url = None
2988
2989                 return None
2990
2991         def _print_formats(self, formats):
2992                 print('Available formats:')
2993                 for fmt in formats.keys():
2994                         for b in formats[fmt]:
2995                                 try:
2996                                         ext = formats[fmt][b][0]
2997                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2998                                 except TypeError: # we have no bitrate info
2999                                         ext = formats[fmt][0]
3000                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3001                                         break
3002
3003         def _real_extract(self, url):
3004                 mobj = re.match(self._VALID_URL, url)
3005                 if mobj is None:
3006                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3007                         return
3008                 # extract uploader & filename from url
3009                 uploader = mobj.group(1).decode('utf-8')
3010                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3011
3012                 # construct API request
3013                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3014                 # retrieve .json file with links to files
3015                 request = urllib2.Request(file_url)
3016                 try:
3017                         self.report_download_json(file_url)
3018                         jsonData = urllib2.urlopen(request).read()
3019                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3020                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % u(err))
3021                         return
3022
3023                 # parse JSON
3024                 json_data = json.loads(jsonData)
3025                 player_url = json_data['player_swf_url']
3026                 formats = dict(json_data['audio_formats'])
3027
3028                 req_format = self._downloader.params.get('format', None)
3029                 bitrate = None
3030
3031                 if self._downloader.params.get('listformats', None):
3032                         self._print_formats(formats)
3033                         return
3034
3035                 if req_format is None or req_format == 'best':
3036                         for format_param in formats.keys():
3037                                 url_list = self.get_urls(formats, format_param)
3038                                 # check urls
3039                                 file_url = self.check_urls(url_list)
3040                                 if file_url is not None:
3041                                         break # got it!
3042                 else:
3043                         if req_format not in formats.keys():
3044                                 self._downloader.trouble(u'ERROR: format is not available')
3045                                 return
3046
3047                         url_list = self.get_urls(formats, req_format)
3048                         file_url = self.check_urls(url_list)
3049                         format_param = req_format
3050
3051                 return [{
3052                         'id': file_id.decode('utf-8'),
3053                         'url': file_url.decode('utf-8'),
3054                         'uploader':     uploader.decode('utf-8'),
3055                         'upload_date': u'NA',
3056                         'title': json_data['name'],
3057                         'ext': file_url.split('.')[-1].decode('utf-8'),
3058                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3059                         'thumbnail': json_data['thumbnail_url'],
3060                         'description': json_data['description'],
3061                         'player_url': player_url.decode('utf-8'),
3062                 }]
3063
class StanfordOpenClassroomIE(InfoExtractor):
        """Information extractor for Stanford's Open ClassRoom

        Three kinds of URL are accepted: a single video page, a course page
        (expanded into every video page it links to) and the site root
        (expanded into every course page it links to).
        """

        _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
        IE_NAME = u'stanfordoc'

        def report_download_webpage(self, objid):
                """Report webpage download."""
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

        def _extract_linked_pages(self, webpage, href_re):
                """Run extraction on every distinct MainFolder page matched by href_re.

                href_re must capture the relative link in group 1. Returns the
                concatenated results in page order.
                """
                results = []
                for link in orderedSet(re.findall(href_re, webpage)):
                        page_url = 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(link)
                        # The error paths of _real_extract return nothing, so the
                        # nested call may hand back None - presumably extract()
                        # forwards that; treat it as "no results".
                        results += self.extract(page_url) or []
                return results

        def _real_extract(self, url):
                """Return the list of info dictionaries reachable from url.

                Returns None after reporting the problem through the downloader
                when the URL is invalid or a required download fails.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                course = mobj.group('course')
                video = mobj.group('video')

                if course and video: # A specific video
                        info = {
                                'id': course + '_' + video,
                                # mandatory fields this site provides no value for
                                'uploader': None,
                                'player_url': None,
                        }

                        self.report_extraction(info['id'])
                        baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
                        xmlUrl = baseUrl + video + '.xml'
                        try:
                                metaXml = urllib2.urlopen(xmlUrl).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % u(err))
                                return
                        mdoc = xml.etree.ElementTree.fromstring(metaXml)
                        try:
                                info['title'] = mdoc.findall('./title')[0].text
                                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                        except (IndexError, TypeError):
                                # IndexError: element missing; TypeError: <videoFile> is empty
                                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                                return
                        info['ext'] = info['url'].rpartition('.')[2]
                        info['format'] = info['ext']
                        return [info]
                elif course: # A course page
                        self.report_download_webpage(course)
                        # _VALID_URL makes the scheme optional, urlopen does not
                        if not re.match(r'https?://', url):
                                url = 'http://' + url
                        try:
                                coursepage = urllib2.urlopen(url).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
                                return

                        return self._extract_linked_pages(coursepage, r'<a href="(VideoPage.php\?[^"]+)">')

                else: # Root page
                        self.report_download_webpage('Stanford OpenClassroom')
                        rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                        try:
                                rootpage = urllib2.urlopen(rootURL).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download course info page: ' + u(err))
                                return

                        return self._extract_linked_pages(rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
3175
class MTVIE(InfoExtractor):
        """Information extractor for MTV.com"""

        _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
        IE_NAME = u'mtv'

        def report_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

        def _real_extract(self, url):
                """Return a one-element list with the info dictionary for url.

                The video page supplies song name, performer, mtvn_uri and the
                playlist id; these are used to request the mediaGen XML, from
                which the last <rendition> is taken. Returns None after
                reporting the problem through the downloader on any failure.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                if not mobj.group('proto'):
                        url = 'http://' + url
                video_id = mobj.group('videoid')
                self.report_webpage(video_id)

                request = urllib2.Request(url)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err))
                        return

                mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract song name')
                        return
                song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
                mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract performer')
                        return
                performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
                video_title = performer + ' - ' + song_name

                mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
                        return
                mtvn_uri = mobj.group(1)

                mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract content id')
                        return
                content_id = mobj.group(1)

                videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
                self.report_extraction(video_id)
                request = urllib2.Request(videogen_url)
                try:
                        metadataXml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % u(err))
                        return

                mdoc = xml.etree.ElementTree.fromstring(metadataXml)
                renditions = mdoc.findall('.//rendition')
                if not renditions:
                        self._downloader.trouble(u'ERROR: unable to find any rendition')
                        return

                # For now, always pick the highest quality.
                # NOTE(review): assumes the feed lists renditions in ascending
                # quality, so the last one is the best - confirm.
                rendition = renditions[-1]

                try:
                        ext = rendition.attrib['type'].partition('/')[2]
                        video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
                        # find() gives None when <src> is absent -> AttributeError
                        video_url = rendition.find('./src').text
                except (KeyError, AttributeError):
                        self._downloader.trouble('Invalid rendition field.')
                        return

                info = {
                        'id': video_id,
                        'url': video_url,
                        'uploader': performer,
                        'title': video_title,
                        'ext': ext,
                        'format': video_format,
                        'player_url': None,
                }

                return [info]
3264
3265
class YoukuIE(InfoExtractor):
        """Information extractor for v.youku.com

        The playlist JSON describes a video as a list of segments, so
        _real_extract returns one info dictionary per segment.
        """

        _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
        IE_NAME = u'Youku'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, file_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

        def _gen_sid(self):
                """Return a session id: millisecond timestamp followed by two random numbers."""
                nowTime = int(time.time() * 1000)
                random1 = random.randint(1000,1998)
                random2 = random.randint(1000,9999)

                return "%d%d%d" %(nowTime,random1,random2)

        def _get_file_ID_mix_string(self, seed):
                """Return the file-id alphabet, shuffled deterministically by seed, as a list."""
                mixed = []
                source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
                seed = float(seed)
                # Each round advances the generator and moves one character from
                # source to mixed, until source is used up.
                while source:
                        seed = (seed * 211 + 30031) % 65536
                        index = int(math.floor(seed / 65536 * len(source)))
                        mixed.append(source.pop(index))
                return mixed

        def _get_file_id(self, fileId, seed):
                """Decode fileId, a '*'-separated list of indices into the shuffled alphabet."""
                mixed = self._get_file_ID_mix_string(seed)
                return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

        def _real_extract(self, url):
                """Return one info dictionary per video segment of url.

                Returns None after reporting the problem through the downloader
                when the URL is invalid, the playlist cannot be downloaded or it
                lacks the expected fields.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group('ID')

                info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

                request = urllib2.Request(info_url, None, std_headers)
                try:
                        self.report_download_webpage(video_id)
                        jsondata = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
                        return

                self.report_extraction(video_id)
                req_format = self._downloader.params.get('format', None)
                try:
                        data = json.loads(jsondata)['data'][0]

                        video_title = data['title']
                        seed = data['seed']

                        if req_format is None or req_format == 'best':
                                if 'hd2' in data['streamfileids']:
                                        stream_key = 'hd2'
                                else:
                                        stream_key = 'flv'
                                ext = u'flv'
                        elif req_format == 'worst':
                                stream_key = 'mp4'
                                ext = u'mp4'
                        else:
                                # any other requested format falls back to flv
                                stream_key = 'flv'
                                ext = u'flv'

                        fileid = self._get_file_id(data['streamfileids'][stream_key], seed)
                        keys = [seg['k'] for seg in data['segs'][stream_key]]

                        #TODO check error
                        #youku only could be viewed from mainland china
                except (ValueError, KeyError, IndexError, TypeError):
                        # malformed JSON or a missing/unexpected field; anything
                        # else (e.g. KeyboardInterrupt) is left to propagate
                        self._downloader.trouble(u'ERROR: unable to extract info section')
                        return

                files_info=[]
                sid = self._gen_sid()

                # fileid[8:10] holds the segment number as two hex digits; it is
                # replaced with each segment's own index
                for index, key in enumerate(keys):

                        temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
                        download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                        info = {
                                'id': '%s_part%02d' % (video_id, index),
                                'url': download_url,
                                'uploader': None,
                                'title': video_title,
                                'ext': ext,
                                'format': u'NA',
                                'player_url': None,
                        }
                        files_info.append(info)

                return files_info
3386
3387
class XNXXIE(InfoExtractor):
        """Information extractor for xnxx.com"""

        _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
        IE_NAME = u'xnxx'
        VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
        VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
        VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

        def report_webpage(self, video_id):
                """Tell the user that the video webpage is being downloaded"""
                message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
                self._downloader.to_screen(message)

        def report_extraction(self, video_id):
                """Tell the user that information is being extracted"""
                message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
                self._downloader.to_screen(message)

        def _search_field(self, pattern, webpage, failure):
                """Return group 1 of pattern in webpage decoded as UTF-8.

                When the pattern is absent, failure is reported through the
                downloader and None is returned.
                """
                found = re.search(pattern, webpage)
                if found is None:
                        self._downloader.trouble(failure)
                        return None
                return found.group(1).decode('utf-8')

        def _real_extract(self, url):
                """Return a one-element list with the info dictionary for url.

                The video url, title and thumbnail are all required: if any of
                them cannot be found the problem is reported and None returned.
                """
                url_match = re.match(self._VALID_URL, url)
                if url_match is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = url_match.group(1).decode('utf-8')

                self.report_webpage(video_id)

                # Fetch the page that embeds the player parameters
                try:
                        webpage = urllib2.urlopen(url).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
                        return

                quoted_url = self._search_field(self.VIDEO_URL_RE, webpage, u'ERROR: unable to extract video url')
                if quoted_url is None:
                        return
                # the page carries the flv url percent-encoded
                video_url = urllib.unquote(quoted_url)

                video_title = self._search_field(self.VIDEO_TITLE_RE, webpage, u'ERROR: unable to extract video title')
                if video_title is None:
                        return

                video_thumbnail = self._search_field(self.VIDEO_THUMB_RE, webpage, u'ERROR: unable to extract video thumbnail')
                if video_thumbnail is None:
                        return

                return [{
                        'id': video_id,
                        'url': video_url,
                        'uploader': None,
                        'upload_date': None,
                        'title': video_title,
                        'ext': 'flv',
                        'format': 'flv',
                        'thumbnail': video_thumbnail,
                        'description': None,
                        'player_url': None,
                }]
3451
3452
class GooglePlusIE(InfoExtractor):
        """Information extractor for plus.google.com."""

        _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
        IE_NAME = u'plus.google'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_extract_entry(self, url):
                """Report downloading entry"""
                self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

        def report_date(self, upload_date):
                """Report entry date"""
                self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

        def report_uploader(self, uploader):
                """Report entry uploader"""
                self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

        def report_title(self, video_title):
                """Report entry title"""
                self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

        def report_extract_vid_page(self, video_page):
                """Report information extraction."""
                self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

        def _real_extract(self, url):
                """Return a one-element list with the info dictionary for url.

                The post page supplies date, uploader, title and the link to the
                video page; the video page supplies the stream links. Returns
                None after reporting the problem through the downloader when a
                page cannot be downloaded or a required link is missing.
                """
                # Extract id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                post_url = mobj.group(0)
                # _VALID_URL makes the scheme optional, urllib2 does not
                if not post_url.startswith('https://'):
                        post_url = 'https://' + post_url
                video_id = mobj.group(2)

                video_extension = 'flv'

                # Step 1, Retrieve post webpage to extract further information
                self.report_extract_entry(post_url)
                request = urllib2.Request(post_url)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % u(err))
                        return

                # Extract update date
                upload_date = u'NA'
                mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
                if mobj:
                        try:
                                # Convert timestring to a format suitable for filename
                                upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d").strftime('%Y%m%d')
                        except ValueError:
                                # timestamp text is not YYYY-MM-DD: the date is
                                # optional, so keep u'NA' rather than abort
                                pass
                self.report_date(upload_date)

                # Extract uploader
                uploader = u'NA'
                mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
                if mobj:
                        uploader = mobj.group(1)
                self.report_uploader(uploader)

                # Extract title
                # Get the first line for title
                video_title = u'NA'
                mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
                if mobj:
                        video_title = mobj.group(1)
                self.report_title(video_title)

                # Step 2, Simulate clicking the image box to launch video
                mobj = re.search(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video page URL')
                        return

                video_page = mobj.group(1)
                request = urllib2.Request(video_page)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err))
                        return
                self.report_extract_vid_page(video_page)

                # Extract video links of all sizes from the video page; each
                # match is a (number, url) pair
                links = re.findall(r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
                if not links:
                        self._downloader.trouble(u'ERROR: unable to extract video links')
                        return

                # Pick the link with the largest number, compared as an integer
                # (as text, '720' would rank above '1080'); the url only breaks
                # ties. NOTE(review): the number is taken to be the resolution.
                video_url = max(links, key=lambda link: (int(link[0]), link[1]))[1]
                # Treat escaped \u0026 style hex
                video_url = unicode(video_url, "unicode_escape")

                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url,
                        'uploader':     uploader.decode('utf-8'),
                        'upload_date':  upload_date.decode('utf-8'),
                        'title':        video_title.decode('utf-8'),
                        'ext':          video_extension.decode('utf-8'),
                        'format':       u'NA',
                        'player_url':   None,
                }]