_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r"""^
 101                          (
 102                              (?:https?://)?                                       # http(s):// (optional)
 103                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 104                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 105                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 106                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 107                              (?:                                                  # the various things that can precede the ID:
 108                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 109                                  |(?:                                             # or the v= param in all its forms
 110                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 111                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 112                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 113                                      v=
 114                                  )
 115                              )?                                                   # optional -> youtube.com/xxxx is OK
 116                          )?                                                       # all until now is optional -> you can pass the naked ID
 117                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 118                          (?(1).+)?                                                # if we found the ID, everything can follow
 119                          $"""
 120         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 121         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 122         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 123         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 124         _NETRC_MACHINE = 'youtube'
 125         # Listed in order of quality
 126         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 127         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 128         _video_extensions = {
 129                 '13': '3gp',
 130                 '17': 'mp4',
 131                 '18': 'mp4',
 132                 '22': 'mp4',
 133                 '37': 'mp4',
 134                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 135                 '43': 'webm',
 136                 '44': 'webm',
 137                 '45': 'webm',
 138                 '46': 'webm',
 139         }
 140         _video_dimensions = {
 141                 '5': '240x400',
 142                 '6': '???',
 143                 '13': '???',
 144                 '17': '144x176',
 145                 '18': '360x640',
 146                 '22': '720x1280',
 147                 '34': '360x640',
 148                 '35': '480x854',
 149                 '37': '1080x1920',
 150                 '38': '3072x4096',
 151                 '43': '360x640',
 152                 '44': '480x854',
 153                 '45': '720x1280',
 154                 '46': '1080x1920',
 155         }
 156         IE_NAME = u'youtube'
 157
 158         def suitable(self, url):
 159                 """Receives a URL and returns True if suitable for this IE."""
 160                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 161
 162         def report_lang(self):
 163                 """Report attempt to set language."""
 164                 self._downloader.to_screen(u'[youtube] Setting language')
 165
 166         def report_login(self):
 167                 """Report attempt to log in."""
 168                 self._downloader.to_screen(u'[youtube] Logging in')
 169
 170         def report_age_confirmation(self):
 171                 """Report attempt to confirm age."""
 172                 self._downloader.to_screen(u'[youtube] Confirming age')
 173
 174         def report_video_webpage_download(self, video_id):
 175                 """Report attempt to download video webpage."""
 176                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 177
 178         def report_video_info_webpage_download(self, video_id):
 179                 """Report attempt to download video info webpage."""
 180                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 181
 182         def report_video_subtitles_download(self, video_id):
 183                 """Report attempt to download video info webpage."""
 184                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 185
 186         def report_information_extraction(self, video_id):
 187                 """Report attempt to extract video information."""
 188                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 189
 190         def report_unavailable_format(self, video_id, format):
 191                 """Report extracted video URL."""
 192                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 193
 194         def report_rtmp_download(self):
 195                 """Indicate the download will use the RTMP protocol."""
 196                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 197
 198         def _closed_captions_xml_to_srt(self, xml_string):
 199                 srt = ''
 200                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 201                 # TODO parse xml instead of regex
 202                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 203                         if not dur: dur = '4'
 204                         start = float(start)
 205                         end = start + float(dur)
 206                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 207                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 208                         caption = unescapeHTML(caption)
 209                         caption = unescapeHTML(caption) # double cycle, intentional
 210                         srt += str(n+1) + '\n'
 211                         srt += start + ' --> ' + end + '\n'
 212                         srt += caption + '\n\n'
 213                 return srt
 214
 215         def _print_formats(self, formats):
 216                 print('Available formats:')
 217                 for x in formats:
 218                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 219
 220         def _real_initialize(self):
 221                 if self._downloader is None:
 222                         return
 223
 224                 username = None
 225                 password = None
 226                 downloader_params = self._downloader.params
 227
 228                 # Attempt to use provided username and password or .netrc data
 229                 if downloader_params.get('username', None) is not None:
 230                         username = downloader_params['username']
 231                         password = downloader_params['password']
 232                 elif downloader_params.get('usenetrc', False):
 233                         try:
 234                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 235                                 if info is not None:
 236                                         username = info[0]
 237                                         password = info[2]
 238                                 else:
 239                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 240                         except (IOError, netrc.NetrcParseError), err:
 241                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 242                                 return
 243
 244                 # Set language
 245                 request = urllib2.Request(self._LANG_URL)
 246                 try:
 247                         self.report_lang()
 248                         urllib2.urlopen(request).read()
 249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 250                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 251                         return
 252
 253                 # No authentication to be performed
 254                 if username is None:
 255                         return
 256
 257                 # Log in
 258                 login_form = {
 259                                 'current_form': 'loginForm',
 260                                 'next':         '/',
 261                                 'action_login': 'Log In',
 262                                 'username':     username,
 263                                 'password':     password,
 264                                 }
 265                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 266                 try:
 267                         self.report_login()
 268                         login_results = urllib2.urlopen(request).read()
 269                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 270                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 271                                 return
 272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 273                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 274                         return
 275
 276                 # Confirm age
 277                 age_form = {
 278                                 'next_url':             '/',
 279                                 'action_confirm':       'Confirm',
 280                                 }
 281                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 282                 try:
 283                         self.report_age_confirmation()
 284                         age_results = urllib2.urlopen(request).read()
 285                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 286                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 287                         return
 288
 289         def _real_extract(self, url):
 290                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 291                 mobj = re.search(self._NEXT_URL_RE, url)
 292                 if mobj:
 293                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 294
 295                 # Extract video id from URL
 296                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 297                 if mobj is None:
 298                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 299                         return
 300                 video_id = mobj.group(2)
 301
 302                 # Get video webpage
 303                 self.report_video_webpage_download(video_id)
 304                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 305                 try:
 306                         video_webpage = urllib2.urlopen(request).read()
 307                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 308                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 309                         return
 310
 311                 # Attempt to extract SWF player URL
 312                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 313                 if mobj is not None:
 314                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 315                 else:
 316                         player_url = None
 317
 318                 # Get video info
 319                 self.report_video_info_webpage_download(video_id)
 320                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 321                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 322                                         % (video_id, el_type))
 323                         request = urllib2.Request(video_info_url)
 324                         try:
 325                                 video_info_webpage = urllib2.urlopen(request).read()
 326                                 video_info = parse_qs(video_info_webpage)
 327                                 if 'token' in video_info:
 328                                         break
 329                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 330                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 331                                 return
 332                 if 'token' not in video_info:
 333                         if 'reason' in video_info:
 334                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 335                         else:
 336                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 337                         return
 338
 339                 # Check for "rental" videos
 340                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 341                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 342                         return
 343
 344                 # Start extracting information
 345                 self.report_information_extraction(video_id)
 346
 347                 # uploader
 348                 if 'author' not in video_info:
 349                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 350                         return
 351                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 352
 353                 # title
 354                 if 'title' not in video_info:
 355                         self._downloader.trouble(u'ERROR: unable to extract video title')
 356                         return
 357                 video_title = urllib.unquote_plus(video_info['title'][0])
 358                 video_title = video_title.decode('utf-8')
 359
 360                 # thumbnail image
 361                 if 'thumbnail_url' not in video_info:
 362                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 363                         video_thumbnail = ''
 364                 else:   # don't panic if we can't find it
 365                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 366
 367                 # upload date
 368                 upload_date = u'NA'
 369                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 370                 if mobj is not None:
 371                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 372                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 373                         for expression in format_expressions:
 374                                 try:
 375                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 376                                 except:
 377                                         pass
 378
 379                 # description
 380                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 381                 if video_description: video_description = clean_html(video_description)
 382                 else: video_description = ''
 383
 384                 # closed captions
 385                 video_subtitles = None
 386                 if self._downloader.params.get('writesubtitles', False):
 387                         try:
 388                                 self.report_video_subtitles_download(video_id)
 389                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 390                                 try:
 391                                         srt_list = urllib2.urlopen(request).read()
 392                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 393                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 394                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 395                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 396                                 if not srt_lang_list:
 397                                         raise Trouble(u'WARNING: video has no closed captions')
 398                                 if self._downloader.params.get('subtitleslang', False):
 399                                         srt_lang = self._downloader.params.get('subtitleslang')
 400                                 elif 'en' in srt_lang_list:
 401                                         srt_lang = 'en'
 402                                 else:
 403                                         srt_lang = srt_lang_list.keys()[0]
 404                                 if not srt_lang in srt_lang_list:
 405                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 406                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 407                                 try:
 408                                         srt_xml = urllib2.urlopen(request).read()
 409                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 410                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
 411                                 if not srt_xml:
 412                                         raise Trouble(u'WARNING: unable to download video subtitles')
 413                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 414                         except Trouble as trouble:
 415                                 self._downloader.trouble(trouble[0])
 416
 417                 if 'length_seconds' not in video_info:
 418                         self._downloader.trouble(u'WARNING: unable to extract video duration')
 419                         video_duration = ''
 420                 else:
 421                         video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
 422
 423                 # token
 424                 video_token = urllib.unquote_plus(video_info['token'][0])
 425
 426                 # Decide which formats to download
 427                 req_format = self._downloader.params.get('format', None)
 428
 429                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 430                         self.report_rtmp_download()
 431                         video_url_list = [(None, video_info['conn'][0])]
 432                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 433                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 434                         url_data = [parse_qs(uds) for uds in url_data_strs]
 435                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 436                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 437
 438                         format_limit = self._downloader.params.get('format_limit', None)
 439                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 440                         if format_limit is not None and format_limit in available_formats:
 441                                 format_list = available_formats[available_formats.index(format_limit):]
 442                         else:
 443                                 format_list = available_formats
 444                         existing_formats = [x for x in format_list if x in url_map]
 445                         if len(existing_formats) == 0:
 446                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 447                                 return
 448                         if self._downloader.params.get('listformats', None):
 449                                 self._print_formats(existing_formats)
 450                                 return
 451                         if req_format is None or req_format == 'best':
 452                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 453                         elif req_format == 'worst':
 454                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 455                         elif req_format in ('-1', 'all'):
 456                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 457                         else:
 458                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 459                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 460                                 req_formats = req_format.split('/')
 461                                 video_url_list = None
 462                                 for rf in req_formats:
 463                                         if rf in url_map:
 464                                                 video_url_list = [(rf, url_map[rf])]
 465                                                 break
 466                                 if video_url_list is None:
 467                                         self._downloader.trouble(u'ERROR: requested format not available')
 468                                         return
 469                 else:
 470                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 471                         return
 472
 473                 results = []
 474                 for format_param, video_real_url in video_url_list:
 475                         # Extension
 476                         video_extension = self._video_extensions.get(format_param, 'flv')
 477
 478                         results.append({
 479                                 'id':           video_id.decode('utf-8'),
 480                                 'url':          video_real_url.decode('utf-8'),
 481                                 'uploader':     video_uploader.decode('utf-8'),
 482                                 'upload_date':  upload_date,
 483                                 'title':        video_title,
 484                                 'ext':          video_extension.decode('utf-8'),
 485                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 486                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 487                                 'description':  video_description,
 488                                 'player_url':   player_url,
 489                                 'subtitles':    video_subtitles,
 490                                 'duration':             video_duration
 491                         })
 492                 return results
 493
 494
 495 class MetacafeIE(InfoExtractor):
 496         """Information Extractor for metacafe.com."""
 497
 498         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 499         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 500         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 501         IE_NAME = u'metacafe'
 502
        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                # Nothing Metacafe-specific to set up: defer to the base class
                InfoExtractor.__init__(self, downloader)
 505
 506         def report_disclaimer(self):
 507                 """Report disclaimer retrieval."""
 508                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 509
 510         def report_age_confirmation(self):
 511                 """Report attempt to confirm age."""
 512                 self._downloader.to_screen(u'[metacafe] Confirming age')
 513
 514         def report_download_webpage(self, video_id):
 515                 """Report webpage download."""
 516                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 517
 518         def report_extraction(self, video_id):
 519                 """Report information extraction."""
 520                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 521
 522         def _real_initialize(self):
 523                 # Retrieve disclaimer
 524                 request = urllib2.Request(self._DISCLAIMER)
 525                 try:
 526                         self.report_disclaimer()
 527                         disclaimer = urllib2.urlopen(request).read()
 528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 529                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 530                         return
 531
 532                 # Confirm age
 533                 disclaimer_form = {
 534                         'filters': '0',
 535                         'submit': "Continue - I'm over 18",
 536                         }
 537                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 538                 try:
 539                         self.report_age_confirmation()
 540                         disclaimer = urllib2.urlopen(request).read()
 541                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 542                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 543                         return
 544
 545         def _real_extract(self, url):
 546                 # Extract id and simplified title from URL
 547                 mobj = re.match(self._VALID_URL, url)
 548                 if mobj is None:
 549                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 550                         return
 551
 552                 video_id = mobj.group(1)
 553
 554                 # Check if video comes from YouTube
 555                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 556                 if mobj2 is not None:
 557                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 558                         return
 559
 560                 # Retrieve video webpage to extract further information
 561                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 562                 try:
 563                         self.report_download_webpage(video_id)
 564                         webpage = urllib2.urlopen(request).read()
 565                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 566                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 567                         return
 568
 569                 # Extract URL, uploader and title from webpage
 570                 self.report_extraction(video_id)
 571                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 572                 if mobj is not None:
 573                         mediaURL = urllib.unquote(mobj.group(1))
 574                         video_extension = mediaURL[-3:]
 575
 576                         # Extract gdaKey if available
 577                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 578                         if mobj is None:
 579                                 video_url = mediaURL
 580                         else:
 581                                 gdaKey = mobj.group(1)
 582                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 583                 else:
 584                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 585                         if mobj is None:
 586                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 587                                 return
 588                         vardict = parse_qs(mobj.group(1))
 589                         if 'mediaData' not in vardict:
 590                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 591                                 return
 592                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 593                         if mobj is None:
 594                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 595                                 return
 596                         mediaURL = mobj.group(1).replace('\\/', '/')
 597                         video_extension = mediaURL[-3:]
 598                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 599
 600                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 601                 if mobj is None:
 602                         self._downloader.trouble(u'ERROR: unable to extract title')
 603                         return
 604                 video_title = mobj.group(1).decode('utf-8')
 605
 606                 mobj = re.search(r'submitter=(.*?);', webpage)
 607                 if mobj is None:
 608                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 609                         return
 610                 video_uploader = mobj.group(1)
 611
 612                 return [{
 613                         'id':           video_id.decode('utf-8'),
 614                         'url':          video_url.decode('utf-8'),
 615                         'uploader':     video_uploader.decode('utf-8'),
 616                         'upload_date':  u'NA',
 617                         'title':        video_title,
 618                         'ext':          video_extension.decode('utf-8'),
 619                         'format':       u'NA',
 620                         'player_url':   None,
 621                 }]
 622
 623
 624 class DailymotionIE(InfoExtractor):
 625         """Information Extractor for Dailymotion"""
 626
 627         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 628         IE_NAME = u'dailymotion'
 629
 630         def __init__(self, downloader=None):
 631                 InfoExtractor.__init__(self, downloader)
 632
 633         def report_download_webpage(self, video_id):
 634                 """Report webpage download."""
 635                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 636
 637         def report_extraction(self, video_id):
 638                 """Report information extraction."""
 639                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 640
 641         def _real_extract(self, url):
 642                 # Extract id and simplified title from URL
 643                 mobj = re.match(self._VALID_URL, url)
 644                 if mobj is None:
 645                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 646                         return
 647
 648                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 649
 650                 video_extension = 'mp4'
 651
 652                 # Retrieve video webpage to extract further information
 653                 request = urllib2.Request(url)
 654                 request.add_header('Cookie', 'family_filter=off')
 655                 try:
 656                         self.report_download_webpage(video_id)
 657                         webpage = urllib2.urlopen(request).read()
 658                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 659                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 660                         return
 661
 662                 # Extract URL, uploader and title from webpage
 663                 self.report_extraction(video_id)
 664                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 665                 if mobj is None:
 666                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 667                         return
 668                 flashvars = urllib.unquote(mobj.group(1))
 669
 670                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 671                         if key in flashvars:
 672                                 max_quality = key
 673                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 674                                 break
 675                 else:
 676                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 677                         return
 678
 679                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 680                 if mobj is None:
 681                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 682                         return
 683
 684                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
 685
 686                 # TODO: support choosing qualities
 687
 688                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 689                 if mobj is None:
 690                         self._downloader.trouble(u'ERROR: unable to extract title')
 691                         return
 692                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 693
 694                 video_uploader = u'NA'
 695                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 696                 if mobj is None:
 697                         # lookin for official user
 698                         mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 699                         if mobj_official is None:
 700                                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 701                         else:
 702                                 video_uploader = mobj_official.group(1)
 703                 else:
 704                         video_uploader = mobj.group(1)
 705
 706                 video_upload_date = u'NA'
 707                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 708                 if mobj is not None:
 709                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 710
 711                 return [{
 712                         'id':           video_id.decode('utf-8'),
 713                         'url':          video_url.decode('utf-8'),
 714                         'uploader':     video_uploader.decode('utf-8'),
 715                         'upload_date':  video_upload_date,
 716                         'title':        video_title,
 717                         'ext':          video_extension.decode('utf-8'),
 718                         'format':       u'NA',
 719                         'player_url':   None,
 720                 }]
 721
 722
 723 class GoogleIE(InfoExtractor):
 724         """Information extractor for video.google.com."""
 725
 726         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 727         IE_NAME = u'video.google'
 728
 729         def __init__(self, downloader=None):
 730                 InfoExtractor.__init__(self, downloader)
 731
 732         def report_download_webpage(self, video_id):
 733                 """Report webpage download."""
 734                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 735
 736         def report_extraction(self, video_id):
 737                 """Report information extraction."""
 738                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 739
 740         def _real_extract(self, url):
 741                 # Extract id from URL
 742                 mobj = re.match(self._VALID_URL, url)
 743                 if mobj is None:
 744                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 745                         return
 746
 747                 video_id = mobj.group(1)
 748
 749                 video_extension = 'mp4'
 750
 751                 # Retrieve video webpage to extract further information
 752                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 753                 try:
 754                         self.report_download_webpage(video_id)
 755                         webpage = urllib2.urlopen(request).read()
 756                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 757                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 758                         return
 759
 760                 # Extract URL, uploader, and title from webpage
 761                 self.report_extraction(video_id)
 762                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 763                 if mobj is None:
 764                         video_extension = 'flv'
 765                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 766                 if mobj is None:
 767                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 768                         return
 769                 mediaURL = urllib.unquote(mobj.group(1))
 770                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 771                 mediaURL = mediaURL.replace('\\x26', '\x26')
 772
 773                 video_url = mediaURL
 774
 775                 mobj = re.search(r'<title>(.*)</title>', webpage)
 776                 if mobj is None:
 777                         self._downloader.trouble(u'ERROR: unable to extract title')
 778                         return
 779                 video_title = mobj.group(1).decode('utf-8')
 780
 781                 # Extract video description
 782                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 783                 if mobj is None:
 784                         self._downloader.trouble(u'ERROR: unable to extract video description')
 785                         return
 786                 video_description = mobj.group(1).decode('utf-8')
 787                 if not video_description:
 788                         video_description = 'No description available.'
 789
 790                 # Extract video thumbnail
 791                 if self._downloader.params.get('forcethumbnail', False):
 792                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 793                         try:
 794                                 webpage = urllib2.urlopen(request).read()
 795                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 796                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 797                                 return
 798                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 799                         if mobj is None:
 800                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 801                                 return
 802                         video_thumbnail = mobj.group(1)
 803                 else:   # we need something to pass to process_info
 804                         video_thumbnail = ''
 805
 806                 return [{
 807                         'id':           video_id.decode('utf-8'),
 808                         'url':          video_url.decode('utf-8'),
 809                         'uploader':     u'NA',
 810                         'upload_date':  u'NA',
 811                         'title':        video_title,
 812                         'ext':          video_extension.decode('utf-8'),
 813                         'format':       u'NA',
 814                         'player_url':   None,
 815                 }]
 816
 817
 818 class PhotobucketIE(InfoExtractor):
 819         """Information extractor for photobucket.com."""
 820
 821         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 822         IE_NAME = u'photobucket'
 823
 824         def __init__(self, downloader=None):
 825                 InfoExtractor.__init__(self, downloader)
 826
 827         def report_download_webpage(self, video_id):
 828                 """Report webpage download."""
 829                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 830
 831         def report_extraction(self, video_id):
 832                 """Report information extraction."""
 833                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 834
 835         def _real_extract(self, url):
 836                 # Extract id from URL
 837                 mobj = re.match(self._VALID_URL, url)
 838                 if mobj is None:
 839                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 840                         return
 841
 842                 video_id = mobj.group(1)
 843
 844                 video_extension = 'flv'
 845
 846                 # Retrieve video webpage to extract further information
 847                 request = urllib2.Request(url)
 848                 try:
 849                         self.report_download_webpage(video_id)
 850                         webpage = urllib2.urlopen(request).read()
 851                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 852                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 853                         return
 854
 855                 # Extract URL, uploader, and title from webpage
 856                 self.report_extraction(video_id)
 857                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 858                 if mobj is None:
 859                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 860                         return
 861                 mediaURL = urllib.unquote(mobj.group(1))
 862
 863                 video_url = mediaURL
 864
 865                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 866                 if mobj is None:
 867                         self._downloader.trouble(u'ERROR: unable to extract title')
 868                         return
 869                 video_title = mobj.group(1).decode('utf-8')
 870
 871                 video_uploader = mobj.group(2).decode('utf-8')
 872
 873                 return [{
 874                         'id':           video_id.decode('utf-8'),
 875                         'url':          video_url.decode('utf-8'),
 876                         'uploader':     video_uploader,
 877                         'upload_date':  u'NA',
 878                         'title':        video_title,
 879                         'ext':          video_extension.decode('utf-8'),
 880                         'format':       u'NA',
 881                         'player_url':   None,
 882                 }]
 883
 884
 885 class YahooIE(InfoExtractor):
 886         """Information extractor for video.yahoo.com."""
 887
 888         # _VALID_URL matches all Yahoo! Video URLs
 889         # _VPAGE_URL matches only the extractable '/watch/' URLs
 890         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 891         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 892         IE_NAME = u'video.yahoo'
 893
 894         def __init__(self, downloader=None):
 895                 InfoExtractor.__init__(self, downloader)
 896
 897         def report_download_webpage(self, video_id):
 898                 """Report webpage download."""
 899                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 900
 901         def report_extraction(self, video_id):
 902                 """Report information extraction."""
 903                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 904
 905         def _real_extract(self, url, new_video=True):
 906                 # Extract ID from URL
 907                 mobj = re.match(self._VALID_URL, url)
 908                 if mobj is None:
 909                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 910                         return
 911
 912                 video_id = mobj.group(2)
 913                 video_extension = 'flv'
 914
 915                 # Rewrite valid but non-extractable URLs as
 916                 # extractable English language /watch/ URLs
 917                 if re.match(self._VPAGE_URL, url) is None:
 918                         request = urllib2.Request(url)
 919                         try:
 920                                 webpage = urllib2.urlopen(request).read()
 921                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 922                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 923                                 return
 924
 925                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 926                         if mobj is None:
 927                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 928                                 return
 929                         yahoo_id = mobj.group(1)
 930
 931                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 932                         if mobj is None:
 933                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 934                                 return
 935                         yahoo_vid = mobj.group(1)
 936
 937                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 938                         return self._real_extract(url, new_video=False)
 939
 940                 # Retrieve video webpage to extract further information
 941                 request = urllib2.Request(url)
 942                 try:
 943                         self.report_download_webpage(video_id)
 944                         webpage = urllib2.urlopen(request).read()
 945                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 946                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 947                         return
 948
 949                 # Extract uploader and title from webpage
 950                 self.report_extraction(video_id)
 951                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 952                 if mobj is None:
 953                         self._downloader.trouble(u'ERROR: unable to extract video title')
 954                         return
 955                 video_title = mobj.group(1).decode('utf-8')
 956
 957                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 958                 if mobj is None:
 959                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 960                         return
 961                 video_uploader = mobj.group(1).decode('utf-8')
 962
 963                 # Extract video thumbnail
 964                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 965                 if mobj is None:
 966                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 967                         return
 968                 video_thumbnail = mobj.group(1).decode('utf-8')
 969
 970                 # Extract video description
 971                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 972                 if mobj is None:
 973                         self._downloader.trouble(u'ERROR: unable to extract video description')
 974                         return
 975                 video_description = mobj.group(1).decode('utf-8')
 976                 if not video_description:
 977                         video_description = 'No description available.'
 978
 979                 # Extract video height and width
 980                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 981                 if mobj is None:
 982                         self._downloader.trouble(u'ERROR: unable to extract video height')
 983                         return
 984                 yv_video_height = mobj.group(1)
 985
 986                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 987                 if mobj is None:
 988                         self._downloader.trouble(u'ERROR: unable to extract video width')
 989                         return
 990                 yv_video_width = mobj.group(1)
 991
 992                 # Retrieve video playlist to extract media URL
 993                 # I'm not completely sure what all these options are, but we
 994                 # seem to need most of them, otherwise the server sends a 401.
 995                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 996                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 997                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 998                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 999                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1000                 try:
1001                         self.report_download_webpage(video_id)
1002                         webpage = urllib2.urlopen(request).read()
1003                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1004                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1005                         return
1006
1007                 # Extract media URL from playlist XML
1008                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1009                 if mobj is None:
1010                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1011                         return
1012                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1013                 video_url = unescapeHTML(video_url)
1014
1015                 return [{
1016                         'id':           video_id.decode('utf-8'),
1017                         'url':          video_url,
1018                         'uploader':     video_uploader,
1019                         'upload_date':  u'NA',
1020                         'title':        video_title,
1021                         'ext':          video_extension.decode('utf-8'),
1022                         'thumbnail':    video_thumbnail.decode('utf-8'),
1023                         'description':  video_description,
1024                         'thumbnail':    video_thumbnail,
1025                         'player_url':   None,
1026                 }]
1027
1028
class VimeoIE(InfoExtractor):
        """Information extractor for vimeo.com."""

        # _VALID_URL matches Vimeo URLs
        _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
        IE_NAME = u'vimeo'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

        def _real_extract(self, url, new_video=True):
                """Extract the information dictionary for a Vimeo video URL.

                new_video is accepted but not used here.

                Returns a one-element list holding the info dictionary, or None
                once the failure has been reported through the downloader.
                """
                # Extract ID from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                video_id = mobj.group(1)

                # Retrieve video webpage to extract further information
                request = urllib2.Request(url, None, std_headers)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return

                # Now we begin extracting as much information as we can from what we
                # retrieved. First we extract the information common to all extractors,
                # and later we extract those that are Vimeo specific.
                self.report_extraction(video_id)

                # Extract the config JSON. A page without the marker raises
                # IndexError and malformed JSON raises ValueError; both are
                # reported the same way.
                try:
                        config = webpage.split(' = {config:')[1].split(',assets:')[0]
                        config = json.loads(config)
                except (IndexError, ValueError):
                        self._downloader.trouble(u'ERROR: unable to extract info section')
                        return

                # Extract title
                video_title = config["video"]["title"]

                # Extract uploader
                video_uploader = config["video"]["owner"]["name"]

                # Extract video thumbnail
                video_thumbnail = config["video"]["thumbnail"]

                # Extract video description
                video_description = get_element_by_id("description", webpage.decode('utf8'))
                if video_description:
                        video_description = clean_html(video_description)
                else:
                        video_description = ''

                # Extract upload date
                video_upload_date = u'NA'
                mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
                if mobj is not None:
                        video_upload_date = mobj.group(1)

                # Vimeo specific: extract request signature and timestamp
                sig = config['request']['signature']
                timestamp = config['request']['timestamp']

                # Vimeo specific: extract video codec and quality information
                # First consider quality, then codecs, then take everything
                # TODO bind to format param
                codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
                files = { 'hd': [], 'sd': [], 'other': []}
                for codec_name, codec_extension in codecs:
                        if codec_name in config["video"]["files"]:
                                if 'hd' in config["video"]["files"][codec_name]:
                                        files['hd'].append((codec_name, codec_extension, 'hd'))
                                elif 'sd' in config["video"]["files"][codec_name]:
                                        files['sd'].append((codec_name, codec_extension, 'sd'))
                                else:
                                        files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

                # Take the first codec of the best quality tier that has any entry
                for quality in ('hd', 'sd', 'other'):
                        if len(files[quality]) > 0:
                                video_quality = files[quality][0][2]
                                video_codec = files[quality][0][0]
                                video_extension = files[quality][0][1]
                                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                                break
                else:
                        self._downloader.trouble(u'ERROR: no known codec found')
                        return

                video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                                        %(video_id, sig, timestamp, video_quality, video_codec.upper())

                return [{
                        'id':           video_id,
                        'url':          video_url,
                        'uploader':     video_uploader,
                        'upload_date':  video_upload_date,
                        'title':        video_title,
                        'ext':          video_extension,
                        'thumbnail':    video_thumbnail,
                        'description':  video_description,
                        'player_url':   None,
                }]
1141
1142
class GenericIE(InfoExtractor):
        """Generic last-resort information extractor.

        Accepts any URL.  The URL is first probed with a HEAD request so that
        redirects (e.g. URL shorteners) are handed back to the downloader;
        otherwise the page is fetched and searched for a media URL passed to a
        player through a ``file=`` or ``source=`` parameter.
        """

        _VALID_URL = r'.*'
        IE_NAME = u'generic'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
                self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

        def report_following_redirect(self, new_url):
                """Report that a redirect to new_url is being followed."""
                self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

        def _test_redirect(self, url):
                """Check if it is a redirect, like url shorteners, in case restart chain.

                Returns True when url redirected somewhere else; the final URL has
                then already been passed to the downloader.  Returns False when
                there is no redirect or when the HEAD probe itself fails, so that
                the caller's regular download can report the problem.
                """
                class HeadRequest(urllib2.Request):
                        """Request that is sent with the HEAD method."""
                        def get_method(self):
                                return "HEAD"

                class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
                        """
                        Subclass the HTTPRedirectHandler to make it use our
                        HeadRequest also on the redirected URL
                        """
                        def redirect_request(self, req, fp, code, msg, headers, newurl):
                                if code in (301, 302, 303, 307):
                                        newurl = newurl.replace(' ', '%20')
                                        # Body-related headers make no sense on the follow-up request
                                        newheaders = dict((k,v) for k,v in req.headers.items()
                                                          if k.lower() not in ("content-length", "content-type"))
                                        return HeadRequest(newurl,
                                                           headers=newheaders,
                                                           origin_req_host=req.get_origin_req_host(),
                                                           unverifiable=True)
                                else:
                                        raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

                class HTTPMethodFallback(urllib2.BaseHandler):
                        """
                        Fallback to GET if HEAD is not allowed (405 HTTP error)
                        """
                        def http_error_405(self, req, fp, code, msg, headers):
                                fp.read()
                                fp.close()

                                newheaders = dict((k,v) for k,v in req.headers.items()
                                                  if k.lower() not in ("content-length", "content-type"))
                                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                                        headers=newheaders,
                                                                        origin_req_host=req.get_origin_req_host(),
                                                                        unverifiable=True))

                # Build our opener
                opener = urllib2.OpenerDirector()
                for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                                HTTPMethodFallback, HEADRedirectHandler,
                                urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
                        opener.add_handler(handler())

                try:
                        response = opener.open(HeadRequest(url))
                except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError):
                        # The probe only detects redirects.  If it fails (invalid
                        # URL, server rejecting HEAD, network error) behave as if
                        # there was no redirect; _real_extract then performs its
                        # regular download, which reports such errors properly.
                        return False
                new_url = response.geturl()
                response.close()

                if url == new_url:
                        return False

                self.report_following_redirect(new_url)
                self._downloader.download([new_url])
                return True

        def _real_extract(self, url):
                """Extract the video embedded in the page at url.

                Returns a one-element list with the information dictionary, or
                None when a redirect was followed instead or when an error was
                reported through the downloader.
                """
                if self._test_redirect(url):
                        return

                video_id = url.split('/')[-1]
                request = urllib2.Request(url)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return
                except ValueError:
                        # since this is the last-resort InfoExtractor, if
                        # this error is thrown, it'll be thrown here
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                self.report_extraction(video_id)
                # Start with something easy: JW Player in SWFObject
                mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
                if mobj is None:
                        # Broaden the search a little bit
                        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                # It's possible that one of the regexes
                # matched, but returned an empty group:
                if mobj.group(1) is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                video_url = urllib.unquote(mobj.group(1))
                # The file name of the media URL provides both id and extension
                video_id, video_extension = os.path.splitext(os.path.basename(video_url))
                video_extension = video_extension[1:]

                # it's tempting to parse this further, but you would
                # have to take into account all the variations like
                #   Video Title - Site Name
                #   Site Name | Video Title
                #   Video Title - Tagline | Site Name
                # and so on and so forth; it's just not practical
                mobj = re.search(r'<title>(.*)</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')

                # video uploader is domain name
                # The path is optional: requiring a '/' after the host made
                # 'http://example.com' match with 'http:' as the host.
                mobj = re.match(r'(?:https?://)?([^/]*)(?:/.*)?$', url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader')
                        return
                video_uploader = mobj.group(1).decode('utf-8')

                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url.decode('utf-8'),
                        'uploader':     video_uploader,
                        'upload_date':  u'NA',
                        'title':        video_title,
                        'ext':          video_extension.decode('utf-8'),
                        'format':       u'NA',
                        'player_url':   None,
                }]
1288
1289
class YoutubeSearchIE(InfoExtractor):
        """Information Extractor for YouTube search queries.

        Queries have the form ``ytsearch[N|all]:terms``; the matching video
        ids are fetched from the API at ``_API_URL`` and each one is handed to
        the downloader as a regular watch URL.
        """
        _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
        _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
        _max_youtube_results = 1000
        IE_NAME = u'youtube:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                """Parse the search query and download the requested results.

                Without a count a single result is downloaded, 'all' requests
                _max_youtube_results results, and a larger count is clamped to
                that maximum with a warning.
                """
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                # Split on the first colon only: the search terms themselves
                # may contain colons.
                prefix, query = query.split(':', 1)
                prefix = prefix[8:]
                query = query.encode('utf-8')
                if prefix == '':
                        n = 1
                elif prefix == 'all':
                        n = self._max_youtube_results
                else:
                        # Only the conversion is guarded, so that a ValueError
                        # raised while downloading is not mistaken for a bad count.
                        try:
                                n = long(prefix)
                        except ValueError: # parsing prefix as integer fails
                                n = 1
                        if n <= 0:
                                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                return
                        elif n > self._max_youtube_results:
                                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                                n = self._max_youtube_results
                self._download_n_results(query, n)

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                pagenum = 0
                limit = n

                # The API is queried in pages of 50 results
                while (50 * pagenum) < limit:
                        self.report_download_page(query, pagenum+1)
                        result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
                        request = urllib2.Request(result_url)
                        try:
                                data = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                                return
                        api_response = json.loads(data)['data']

                        # A page without any 'items' carries no videos: stop
                        # instead of failing with a KeyError.
                        items = api_response.get('items')
                        if not items:
                                break
                        video_ids.extend(video['id'] for video in items)

                        # Never ask for more results than the search offers
                        limit = min(n, api_response['totalItems'])
                        pagenum += 1

                for video_id in video_ids[:n]:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
                return
1364
1365
class GoogleSearchIE(InfoExtractor):
        """Information Extractor for Google Video search queries.

        Queries have the form ``gvsearch[N|all]:terms``; result pages are
        scraped for video ids and each id is handed to the downloader as a
        videoplay URL.
        """
        _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
        _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
        _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
        _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
        _max_google_results = 1000
        IE_NAME = u'video.google:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                """Parse the search query and download the requested results.

                Without a count a single result is downloaded, 'all' requests
                _max_google_results results, and a larger count is clamped to
                that maximum with a warning.
                """
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                # Split on the first colon only: the search terms themselves
                # may contain colons.
                prefix, query = query.split(':', 1)
                prefix = prefix[8:]
                query = query.encode('utf-8')
                if prefix == '':
                        n = 1
                elif prefix == 'all':
                        n = self._max_google_results
                else:
                        # Only the conversion is guarded, so that a ValueError
                        # raised while downloading is not mistaken for a bad count.
                        try:
                                n = long(prefix)
                        except ValueError: # parsing prefix as integer fails
                                n = 1
                        if n <= 0:
                                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                return
                        elif n > self._max_google_results:
                                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                                n = self._max_google_results
                self._download_n_results(query, n)

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                already_seen = set()
                pagenum = 0

                while True:
                        # pagenum is zero-based; show a one-based page number
                        self.report_download_page(query, pagenum + 1)
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
                        request = urllib2.Request(result_url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Extract video identifiers
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = mobj.group(1)
                                if video_id in already_seen:
                                        continue
                                already_seen.add(video_id)
                                video_ids.append(video_id)
                                if len(video_ids) == n:
                                        # Specified n videos reached
                                        break

                        # Stop once enough ids were collected or the results ran out
                        if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break

                        pagenum = pagenum + 1

                for video_id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1446
1447
class YahooSearchIE(InfoExtractor):
        """Information Extractor for Yahoo! Video search queries.

        Queries have the form ``yvsearch[N|all]:terms``; result pages are
        scraped for video ids and each id is handed to the downloader as a
        watch URL.
        """
        _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
        _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
        _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
        _MORE_PAGES_INDICATOR = r'\s*Next'
        _max_yahoo_results = 1000
        IE_NAME = u'video.yahoo:search'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search page with given number."""
                query = query.decode(preferredencoding())
                self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

        def _real_extract(self, query):
                """Parse the search query and download the requested results.

                Without a count a single result is downloaded, 'all' requests
                _max_yahoo_results results, and a larger count is clamped to
                that maximum with a warning.
                """
                mobj = re.match(self._VALID_URL, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                # Split on the first colon only: the search terms themselves
                # may contain colons.
                prefix, query = query.split(':', 1)
                prefix = prefix[8:]
                query = query.encode('utf-8')
                if prefix == '':
                        n = 1
                elif prefix == 'all':
                        n = self._max_yahoo_results
                else:
                        # Only the conversion is guarded, so that a ValueError
                        # raised while downloading is not mistaken for a bad count.
                        try:
                                n = long(prefix)
                        except ValueError: # parsing prefix as integer fails
                                n = 1
                        if n <= 0:
                                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                return
                        elif n > self._max_yahoo_results:
                                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                                n = self._max_yahoo_results
                self._download_n_results(query, n)

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                already_seen = set()
                pagenum = 1

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
                        request = urllib2.Request(result_url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Extract video identifiers
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = mobj.group(1)
                                if video_id in already_seen:
                                        continue
                                already_seen.add(video_id)
                                video_ids.append(video_id)
                                if len(video_ids) == n:
                                        # Specified n videos reached
                                        break

                        # Stop once enough ids were collected or the results ran out
                        if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break

                        pagenum = pagenum + 1

                for video_id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
1530
1531
class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists.

        Walks the pages of a playlist (or artist page), gathers the video ids
        linked from them and passes every selected video to the downloader.
        """

        _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
        _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
        _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
        IE_NAME = u'youtube:playlist'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, playlist_id, pagenum):
                """Tell the user which playlist page is about to be fetched."""
                message = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
                self._downloader.to_screen(message)

        def _real_extract(self, url):
                """Download the videos of the playlist that url points to.

                The 'playliststart' and 'playlistend' downloader parameters
                select which part of the collected ids is downloaded.
                """
                match = re.match(self._VALID_URL, url)
                if match is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                # The URL names one video inside the playlist: fetch just that
                single_video = match.group(3)
                if single_video is not None:
                        self._downloader.download([single_video])
                        return

                # Artist pages use their own access path; everything else is
                # requested as a regular playlist with the 'p' parameter.
                if match.group(1) == 'a':
                        playlist_prefix, playlist_access = 'a', 'artist'
                else:
                        playlist_prefix, playlist_access = 'p', 'view_play_list'
                playlist_id = match.group(2)

                collected = []
                pagenum = 1
                while True:
                        self.report_download_page(playlist_id, pagenum)
                        page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
                        request = urllib2.Request(page_url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Gather this page's ids, skipping repeats within the page
                        page_ids = []
                        for found in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                                candidate = found.group(1)
                                if candidate in page_ids:
                                        continue
                                page_ids.append(candidate)
                        collected += page_ids

                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break
                        pagenum += 1

                # Apply the requested window; an end of -1 means "up to the last"
                params = self._downloader.params
                first = params.get('playliststart', 1) - 1
                last = params.get('playlistend', -1)
                selected = collected[first:] if last == -1 else collected[first:last]

                for video_id in selected:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
                return
1603
1604
class YoutubeChannelIE(InfoExtractor):
        """Information Extractor for YouTube channels.

        Walks the video listing pages of a channel, gathers the video ids
        linked from them and passes every video to the downloader.
        """

        _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
        _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
        _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
        IE_NAME = u'youtube:channel'

        def report_download_page(self, channel_id, pagenum):
                """Tell the user which channel page is about to be fetched."""
                message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
                self._downloader.to_screen(message)

        def _real_extract(self, url):
                """Download every video listed on the channel that url points to."""
                match = re.match(self._VALID_URL, url)
                if match is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                channel_id = match.group(1)
                collected = []
                pagenum = 1
                while True:
                        self.report_download_page(channel_id, pagenum)
                        page_url = self._TEMPLATE_URL % (channel_id, pagenum)
                        request = urllib2.Request(page_url)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Gather this page's ids, skipping repeats within the page
                        page_ids = []
                        for found in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                                candidate = found.group(1)
                                if candidate in page_ids:
                                        continue
                                page_ids.append(candidate)
                        collected += page_ids

                        # Without a link to a following page this was the last one
                        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                                break
                        pagenum += 1

                for video_id in collected:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
                return
1653
1654
class YoutubeUserIE(InfoExtractor):
        """Information Extractor for YouTube users.

        Collects the ids of a user's uploads page by page from the feed at
        ``_GDATA_URL`` and passes the selected videos to the downloader.
        """

        _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
        _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
        _GDATA_PAGE_SIZE = 50
        _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
        IE_NAME = u'youtube:user'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_page(self, username, start_index):
                """Report attempt to download user page."""
                # A page holds _GDATA_PAGE_SIZE entries starting at start_index,
                # so the last index of the (inclusive) range is one less than
                # the start of the following page.
                self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                (username, start_index, start_index + self._GDATA_PAGE_SIZE - 1))

        def _real_extract(self, url):
                """Download the uploads of the user that url points to.

                The 'playliststart' and 'playlistend' downloader parameters
                select which part of the collected ids is downloaded.
                """
                # Extract username
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
                        return

                username = mobj.group(1)

                # Download video ids using YouTube Data API. Result size per
                # query is limited (currently to 50 videos) so we need to query
                # page by page until there are no video ids - it means we got
                # all of them.

                video_ids = []
                pagenum = 0

                while True:
                        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
                        self.report_download_page(username, start_index)

                        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                                return

                        # Extract video identifiers, skipping repeats within the page
                        ids_in_page = []
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(mobj.group(1))

                        video_ids.extend(ids_in_page)

                        # A little optimization - if current page is not
                        # "full", ie. does not contain PAGE_SIZE video ids then
                        # we can assume that this page is the last one - there
                        # are no more ids on further pages - no need to query
                        # again.

                        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                                break

                        pagenum += 1

                all_ids_count = len(video_ids)
                playliststart = self._downloader.params.get('playliststart', 1) - 1
                playlistend = self._downloader.params.get('playlistend', -1)

                # A playlistend of -1 means "up to the last video"
                if playlistend == -1:
                        video_ids = video_ids[playliststart:]
                else:
                        video_ids = video_ids[playliststart:playlistend]

                self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                (username, all_ids_count, len(video_ids)))

                for video_id in video_ids:
                        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1736
1737
1738 class BlipTVUserIE(InfoExtractor):
1739         """Information Extractor for blip.tv users."""
1740
1741         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1742         _PAGE_SIZE = 12
1743         IE_NAME = u'blip.tv:user'
1744
1745         def __init__(self, downloader=None):
1746                 InfoExtractor.__init__(self, downloader)
1747
1748         def report_download_page(self, username, pagenum):
1749                 """Report attempt to download user page."""
1750                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1751                                 (self.IE_NAME, username, pagenum))
1752
1753         def _real_extract(self, url):
1754                 # Extract username
1755                 mobj = re.match(self._VALID_URL, url)
1756                 if mobj is None:
1757                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1758                         return
1759
1760                 username = mobj.group(1)
1761
1762                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1763
1764                 request = urllib2.Request(url)
1765
1766                 try:
1767                         page = urllib2.urlopen(request).read().decode('utf-8')
1768                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1769                         page_base = page_base % mobj.group(1)
1770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1771                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1772                         return
1773
1774
1775                 # Download video ids using BlipTV Ajax calls. Result size per
1776                 # query is limited (currently to 12 videos) so we need to query
1777                 # page by page until there are no video ids - it means we got
1778                 # all of them.
1779
1780                 video_ids = []
1781                 pagenum = 1
1782
1783                 while True:
1784                         self.report_download_page(username, pagenum)
1785
1786                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1787
1788                         try:
1789                                 page = urllib2.urlopen(request).read().decode('utf-8')
1790                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1791                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1792                                 return
1793
1794                         # Extract video identifiers
1795                         ids_in_page = []
1796
1797                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1798                                 if mobj.group(1) not in ids_in_page:
1799                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1800
1801                         video_ids.extend(ids_in_page)
1802
1803                         # A little optimization - if current page is not
1804                         # "full", ie. does not contain PAGE_SIZE video ids then
1805                         # we can assume that this page is the last one - there
1806                         # are no more ids on further pages - no need to query
1807                         # again.
1808
1809                         if len(ids_in_page) < self._PAGE_SIZE:
1810                                 break
1811
1812                         pagenum += 1
1813
1814                 all_ids_count = len(video_ids)
1815                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1816                 playlistend = self._downloader.params.get('playlistend', -1)
1817
1818                 if playlistend == -1:
1819                         video_ids = video_ids[playliststart:]
1820                 else:
1821                         video_ids = video_ids[playliststart:playlistend]
1822
1823                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1824                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1825
1826                 for video_id in video_ids:
1827                         self._downloader.download([u'http://blip.tv/'+video_id])
1828
1829
1830 class DepositFilesIE(InfoExtractor):
1831         """Information extractor for depositfiles.com"""
1832
1833         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1834         IE_NAME = u'DepositFiles'
1835
1836         def __init__(self, downloader=None):
1837                 InfoExtractor.__init__(self, downloader)
1838
1839         def report_download_webpage(self, file_id):
1840                 """Report webpage download."""
1841                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1842
1843         def report_extraction(self, file_id):
1844                 """Report information extraction."""
1845                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1846
1847         def _real_extract(self, url):
1848                 file_id = url.split('/')[-1]
1849                 # Rebuild url in english locale
1850                 url = 'http://depositfiles.com/en/files/' + file_id
1851
1852                 # Retrieve file webpage with 'Free download' button pressed
1853                 free_download_indication = { 'gateway_result' : '1' }
1854                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1855                 try:
1856                         self.report_download_webpage(file_id)
1857                         webpage = urllib2.urlopen(request).read()
1858                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1860                         return
1861
1862                 # Search for the real file URL
1863                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1864                 if (mobj is None) or (mobj.group(1) is None):
1865                         # Try to figure out reason of the error.
1866                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1867                         if (mobj is not None) and (mobj.group(1) is not None):
1868                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1869                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1870                         else:
1871                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1872                         return
1873
1874                 file_url = mobj.group(1)
1875                 file_extension = os.path.splitext(file_url)[1][1:]
1876
1877                 # Search for file title
1878                 mobj = re.search(r'<b title="(.*?)">', webpage)
1879                 if mobj is None:
1880                         self._downloader.trouble(u'ERROR: unable to extract title')
1881                         return
1882                 file_title = mobj.group(1).decode('utf-8')
1883
1884                 return [{
1885                         'id':           file_id.decode('utf-8'),
1886                         'url':          file_url.decode('utf-8'),
1887                         'uploader':     u'NA',
1888                         'upload_date':  u'NA',
1889                         'title':        file_title,
1890                         'ext':          file_extension.decode('utf-8'),
1891                         'format':       u'NA',
1892                         'player_url':   None,
1893                 }]
1894
1895
1896 class FacebookIE(InfoExtractor):
1897         """Information Extractor for Facebook"""
1898
1899         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1900         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1901         _NETRC_MACHINE = 'facebook'
1902         _available_formats = ['video', 'highqual', 'lowqual']
1903         _video_extensions = {
1904                 'video': 'mp4',
1905                 'highqual': 'mp4',
1906                 'lowqual': 'mp4',
1907         }
1908         IE_NAME = u'facebook'
1909
1910         def __init__(self, downloader=None):
1911                 InfoExtractor.__init__(self, downloader)
1912
1913         def _reporter(self, message):
1914                 """Add header and report message."""
1915                 self._downloader.to_screen(u'[facebook] %s' % message)
1916
1917         def report_login(self):
1918                 """Report attempt to log in."""
1919                 self._reporter(u'Logging in')
1920
1921         def report_video_webpage_download(self, video_id):
1922                 """Report attempt to download video webpage."""
1923                 self._reporter(u'%s: Downloading video webpage' % video_id)
1924
1925         def report_information_extraction(self, video_id):
1926                 """Report attempt to extract video information."""
1927                 self._reporter(u'%s: Extracting video information' % video_id)
1928
1929         def _parse_page(self, video_webpage):
1930                 """Extract video information from page"""
1931                 # General data
1932                 data = {'title': r'\("video_title", "(.*?)"\)',
1933                         'description': r'<div class="datawrap">(.*?)</div>',
1934                         'owner': r'\("video_owner_name", "(.*?)"\)',
1935                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1936                         }
1937                 video_info = {}
1938                 for piece in data.keys():
1939                         mobj = re.search(data[piece], video_webpage)
1940                         if mobj is not None:
1941                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1942
1943                 # Video urls
1944                 video_urls = {}
1945                 for fmt in self._available_formats:
1946                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1947                         if mobj is not None:
1948                                 # URL is in a Javascript segment inside an escaped Unicode format within
1949                                 # the generally utf-8 page
1950                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1951                 video_info['video_urls'] = video_urls
1952
1953                 return video_info
1954
1955         def _real_initialize(self):
1956                 if self._downloader is None:
1957                         return
1958
1959                 useremail = None
1960                 password = None
1961                 downloader_params = self._downloader.params
1962
1963                 # Attempt to use provided username and password or .netrc data
1964                 if downloader_params.get('username', None) is not None:
1965                         useremail = downloader_params['username']
1966                         password = downloader_params['password']
1967                 elif downloader_params.get('usenetrc', False):
1968                         try:
1969                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1970                                 if info is not None:
1971                                         useremail = info[0]
1972                                         password = info[2]
1973                                 else:
1974                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1975                         except (IOError, netrc.NetrcParseError), err:
1976                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1977                                 return
1978
1979                 if useremail is None:
1980                         return
1981
1982                 # Log in
1983                 login_form = {
1984                         'email': useremail,
1985                         'pass': password,
1986                         'login': 'Log+In'
1987                         }
1988                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1989                 try:
1990                         self.report_login()
1991                         login_results = urllib2.urlopen(request).read()
1992                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1993                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1994                                 return
1995                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1996                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1997                         return
1998
1999         def _real_extract(self, url):
2000                 mobj = re.match(self._VALID_URL, url)
2001                 if mobj is None:
2002                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2003                         return
2004                 video_id = mobj.group('ID')
2005
2006                 # Get video webpage
2007                 self.report_video_webpage_download(video_id)
2008                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2009                 try:
2010                         page = urllib2.urlopen(request)
2011                         video_webpage = page.read()
2012                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2013                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2014                         return
2015
2016                 # Start extracting information
2017                 self.report_information_extraction(video_id)
2018
2019                 # Extract information
2020                 video_info = self._parse_page(video_webpage)
2021
2022                 # uploader
2023                 if 'owner' not in video_info:
2024                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2025                         return
2026                 video_uploader = video_info['owner']
2027
2028                 # title
2029                 if 'title' not in video_info:
2030                         self._downloader.trouble(u'ERROR: unable to extract video title')
2031                         return
2032                 video_title = video_info['title']
2033                 video_title = video_title.decode('utf-8')
2034
2035                 # thumbnail image
2036                 if 'thumbnail' not in video_info:
2037                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2038                         video_thumbnail = ''
2039                 else:
2040                         video_thumbnail = video_info['thumbnail']
2041
2042                 # upload date
2043                 upload_date = u'NA'
2044                 if 'upload_date' in video_info:
2045                         upload_time = video_info['upload_date']
2046                         timetuple = email.utils.parsedate_tz(upload_time)
2047                         if timetuple is not None:
2048                                 try:
2049                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2050                                 except:
2051                                         pass
2052
2053                 # description
2054                 video_description = video_info.get('description', 'No description available.')
2055
2056                 url_map = video_info['video_urls']
2057                 if len(url_map.keys()) > 0:
2058                         # Decide which formats to download
2059                         req_format = self._downloader.params.get('format', None)
2060                         format_limit = self._downloader.params.get('format_limit', None)
2061
2062                         if format_limit is not None and format_limit in self._available_formats:
2063                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2064                         else:
2065                                 format_list = self._available_formats
2066                         existing_formats = [x for x in format_list if x in url_map]
2067                         if len(existing_formats) == 0:
2068                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2069                                 return
2070                         if req_format is None:
2071                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2072                         elif req_format == 'worst':
2073                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2074                         elif req_format == '-1':
2075                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2076                         else:
2077                                 # Specific format
2078                                 if req_format not in url_map:
2079                                         self._downloader.trouble(u'ERROR: requested format not available')
2080                                         return
2081                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2082
2083                 results = []
2084                 for format_param, video_real_url in video_url_list:
2085                         # Extension
2086                         video_extension = self._video_extensions.get(format_param, 'mp4')
2087
2088                         results.append({
2089                                 'id':           video_id.decode('utf-8'),
2090                                 'url':          video_real_url.decode('utf-8'),
2091                                 'uploader':     video_uploader.decode('utf-8'),
2092                                 'upload_date':  upload_date,
2093                                 'title':        video_title,
2094                                 'ext':          video_extension.decode('utf-8'),
2095                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2096                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2097                                 'description':  video_description.decode('utf-8'),
2098                                 'player_url':   None,
2099                         })
2100                 return results
2101
2102 class BlipTVIE(InfoExtractor):
2103         """Information extractor for blip.tv"""
2104
2105         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2106         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2107         IE_NAME = u'blip.tv'
2108
2109         def report_extraction(self, file_id):
2110                 """Report information extraction."""
2111                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2112
2113         def report_direct_download(self, title):
2114                 """Report information extraction."""
2115                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2116
2117         def _real_extract(self, url):
2118                 mobj = re.match(self._VALID_URL, url)
2119                 if mobj is None:
2120                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2121                         return
2122
2123                 if '?' in url:
2124                         cchar = '&'
2125                 else:
2126                         cchar = '?'
2127                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2128                 request = urllib2.Request(json_url.encode('utf-8'))
2129                 self.report_extraction(mobj.group(1))
2130                 info = None
2131                 try:
2132                         urlh = urllib2.urlopen(request)
2133                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2134                                 basename = url.split('/')[-1]
2135                                 title,ext = os.path.splitext(basename)
2136                                 title = title.decode('UTF-8')
2137                                 ext = ext.replace('.', '')
2138                                 self.report_direct_download(title)
2139                                 info = {
2140                                         'id': title,
2141                                         'url': url,
2142                                         'title': title,
2143                                         'ext': ext,
2144                                         'urlhandle': urlh
2145                                 }
2146                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2147                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2148                         return
2149                 if info is None: # Regular URL
2150                         try:
2151                                 json_code = urlh.read()
2152                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2153                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2154                                 return
2155
2156                         try:
2157                                 json_data = json.loads(json_code)
2158                                 if 'Post' in json_data:
2159                                         data = json_data['Post']
2160                                 else:
2161                                         data = json_data
2162
2163                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2164                                 video_url = data['media']['url']
2165                                 umobj = re.match(self._URL_EXT, video_url)
2166                                 if umobj is None:
2167                                         raise ValueError('Can not determine filename extension')
2168                                 ext = umobj.group(1)
2169
2170                                 info = {
2171                                         'id': data['item_id'],
2172                                         'url': video_url,
2173                                         'uploader': data['display_name'],
2174                                         'upload_date': upload_date,
2175                                         'title': data['title'],
2176                                         'ext': ext,
2177                                         'format': data['media']['mimeType'],
2178                                         'thumbnail': data['thumbnailUrl'],
2179                                         'description': data['description'],
2180                                         'player_url': data['embedUrl']
2181                                 }
2182                         except (ValueError,KeyError), err:
2183                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2184                                 return
2185
2186                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2187                 return [info]
2188
2189
2190 class MyVideoIE(InfoExtractor):
2191         """Information Extractor for myvideo.de."""
2192
2193         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2194         IE_NAME = u'myvideo'
2195
2196         def __init__(self, downloader=None):
2197                 InfoExtractor.__init__(self, downloader)
2198
2199         def report_download_webpage(self, video_id):
2200                 """Report webpage download."""
2201                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2202
2203         def report_extraction(self, video_id):
2204                 """Report information extraction."""
2205                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2206
2207         def _real_extract(self,url):
2208                 mobj = re.match(self._VALID_URL, url)
2209                 if mobj is None:
2210                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2211                         return
2212
2213                 video_id = mobj.group(1)
2214
2215                 # Get video webpage
2216                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2217                 try:
2218                         self.report_download_webpage(video_id)
2219                         webpage = urllib2.urlopen(request).read()
2220                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2221                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2222                         return
2223
2224                 self.report_extraction(video_id)
2225                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2226                                  webpage)
2227                 if mobj is None:
2228                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2229                         return
2230                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2231
2232                 mobj = re.search('<title>([^<]+)</title>', webpage)
2233                 if mobj is None:
2234                         self._downloader.trouble(u'ERROR: unable to extract title')
2235                         return
2236
2237                 video_title = mobj.group(1)
2238
2239                 return [{
2240                         'id':           video_id,
2241                         'url':          video_url,
2242                         'uploader':     u'NA',
2243                         'upload_date':  u'NA',
2244                         'title':        video_title,
2245                         'ext':          u'flv',
2246                         'format':       u'NA',
2247                         'player_url':   None,
2248                 }]
2249
2250 class ComedyCentralIE(InfoExtractor):
2251         """Information extractor for The Daily Show and Colbert Report """
2252
2253         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2254         IE_NAME = u'comedycentral'
2255
2256         _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2257
2258         _video_extensions = {
2259                 '3500': 'mp4',
2260                 '2200': 'mp4',
2261                 '1700': 'mp4',
2262                 '1200': 'mp4',
2263                 '750': 'mp4',
2264                 '400': 'mp4',
2265         }
2266         _video_dimensions = {
2267                 '3500': '1280x720',
2268                 '2200': '960x540',
2269                 '1700': '768x432',
2270                 '1200': '640x360',
2271                 '750': '512x288',
2272                 '400': '384x216',
2273         }
2274
2275         def report_extraction(self, episode_id):
2276                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2277
2278         def report_config_download(self, episode_id):
2279                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2280
2281         def report_index_download(self, episode_id):
2282                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2283
2284         def report_player_url(self, episode_id):
2285                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2286
2287
2288         def _print_formats(self, formats):
2289                 print('Available formats:')
2290                 for x in formats:
2291                         print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2292
2293
2294
2295         def _real_extract(self, url):
2296                 mobj = re.match(self._VALID_URL, url)
2297                 if mobj is None:
2298                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2299                         return
2300
2301                 if mobj.group('shortname'):
2302                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2303                                 url = u'http://www.thedailyshow.com/full-episodes/'
2304                         else:
2305                                 url = u'http://www.colbertnation.com/full-episodes/'
2306                         mobj = re.match(self._VALID_URL, url)
2307                         assert mobj is not None
2308
2309                 dlNewest = not mobj.group('episode')
2310                 if dlNewest:
2311                         epTitle = mobj.group('showname')
2312                 else:
2313                         epTitle = mobj.group('episode')
2314
2315                 req = urllib2.Request(url)
2316                 self.report_extraction(epTitle)
2317                 try:
2318                         htmlHandle = urllib2.urlopen(req)
2319                         html = htmlHandle.read()
2320                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2321                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2322                         return
2323                 if dlNewest:
2324                         url = htmlHandle.geturl()
2325                         mobj = re.match(self._VALID_URL, url)
2326                         if mobj is None:
2327                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2328                                 return
2329                         if mobj.group('episode') == '':
2330                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2331                                 return
2332                         epTitle = mobj.group('episode')
2333
2334                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2335
2336                 if len(mMovieParams) == 0:
2337                         # The Colbert Report embeds the information in a without
2338                         # a URL prefix; so extract the alternate reference
2339                         # and then add the URL prefix manually.
2340
2341                         altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2342                         if len(altMovieParams) == 0:
2343                                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2344                                 return
2345                         else:
2346                                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2347
2348                 playerUrl_raw = mMovieParams[0][0]
2349                 self.report_player_url(epTitle)
2350                 try:
2351                         urlHandle = urllib2.urlopen(playerUrl_raw)
2352                         playerUrl = urlHandle.geturl()
2353                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2354                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2355                         return
2356
2357                 uri = mMovieParams[0][1]
2358                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2359                 self.report_index_download(epTitle)
2360                 try:
2361                         indexXml = urllib2.urlopen(indexUrl).read()
2362                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2363                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2364                         return
2365
2366                 results = []
2367
2368                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2369                 itemEls = idoc.findall('.//item')
2370                 for itemEl in itemEls:
2371                         mediaId = itemEl.findall('./guid')[0].text
2372                         shortMediaId = mediaId.split(':')[-1]
2373                         showId = mediaId.split(':')[-2].replace('.com', '')
2374                         officialTitle = itemEl.findall('./title')[0].text
2375                         officialDate = itemEl.findall('./pubDate')[0].text
2376
2377                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2378                                                 urllib.urlencode({'uri': mediaId}))
2379                         configReq = urllib2.Request(configUrl)
2380                         self.report_config_download(epTitle)
2381                         try:
2382                                 configXml = urllib2.urlopen(configReq).read()
2383                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2384                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2385                                 return
2386
2387                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2388                         turls = []
2389                         for rendition in cdoc.findall('.//rendition'):
2390                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2391                                 turls.append(finfo)
2392
2393                         if len(turls) == 0:
2394                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2395                                 continue
2396
2397                         if self._downloader.params.get('listformats', None):
2398                             self._print_formats([i[0] for i in turls])
2399                             return
2400
2401                         # For now, just pick the highest bitrate
2402                         format,video_url = turls[-1]
2403
2404                         # Get the format arg from the arg stream
2405                         req_format = self._downloader.params.get('format', None)
2406
2407                         # Select format if we can find one
2408                         for f,v in turls:
2409                             if f == req_format:
2410                               format, video_url = f, v
2411                               break
2412
2413                         # Patch to download from alternative CDN, which does not
2414                         # break on current RTMPDump builds
2415
2416
2417                         broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2418                         better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2419
2420                         if video_url.startswith(broken_cdn):
2421                             video_url = video_url.replace(broken_cdn, better_cdn)
2422
2423
2424                         effTitle = showId + u'-' + epTitle
2425                         info = {
2426                                 'id': shortMediaId,
2427                                 'url': video_url,
2428                                 'uploader': showId,
2429                                 'upload_date': officialDate,
2430                                 'title': effTitle,
2431                                 'ext': 'mp4',
2432                                 'format': format,
2433                                 'thumbnail': None,
2434                                 'description': officialTitle,
2435                                 'player_url': None #playerUrl
2436                         }
2437
2438                         results.append(info)
2439
2440                 return results
2441
2442
2443 class EscapistIE(InfoExtractor):
2444         """Information extractor for The Escapist """
2445
2446         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2447         IE_NAME = u'escapist'
2448
2449         def report_extraction(self, showName):
2450                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2451
2452         def report_config_download(self, showName):
2453                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2454
2455         def _real_extract(self, url):
2456                 mobj = re.match(self._VALID_URL, url)
2457                 if mobj is None:
2458                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2459                         return
2460                 showName = mobj.group('showname')
2461                 videoId = mobj.group('episode')
2462
2463                 self.report_extraction(showName)
2464                 try:
2465                         webPage = urllib2.urlopen(url)
2466                         webPageBytes = webPage.read()
2467                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2468                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2469                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2471                         return
2472
2473                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2474                 description = unescapeHTML(descMatch.group(1))
2475                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2476                 imgUrl = unescapeHTML(imgMatch.group(1))
2477                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2478                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2479                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2480                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2481
2482                 self.report_config_download(showName)
2483                 try:
2484                         configJSON = urllib2.urlopen(configUrl).read()
2485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2487                         return
2488
2489                 # Technically, it's JavaScript, not JSON
2490                 configJSON = configJSON.replace("'", '"')
2491
2492                 try:
2493                         config = json.loads(configJSON)
2494                 except (ValueError,), err:
2495                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2496                         return
2497
2498                 playlist = config['playlist']
2499                 videoUrl = playlist[1]['url']
2500
2501                 info = {
2502                         'id': videoId,
2503                         'url': videoUrl,
2504                         'uploader': showName,
2505                         'upload_date': None,
2506                         'title': showName,
2507                         'ext': 'flv',
2508                         'format': 'flv',
2509                         'thumbnail': imgUrl,
2510                         'description': description,
2511                         'player_url': playerUrl,
2512                 }
2513
2514                 return [info]
2515
2516
2517 class CollegeHumorIE(InfoExtractor):
2518         """Information extractor for collegehumor.com"""
2519
2520         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2521         IE_NAME = u'collegehumor'
2522
2523         def report_webpage(self, video_id):
2524                 """Report information extraction."""
2525                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2526
2527         def report_extraction(self, video_id):
2528                 """Report information extraction."""
2529                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2530
2531         def _real_extract(self, url):
2532                 mobj = re.match(self._VALID_URL, url)
2533                 if mobj is None:
2534                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2535                         return
2536                 video_id = mobj.group('videoid')
2537
2538                 self.report_webpage(video_id)
2539                 request = urllib2.Request(url)
2540                 try:
2541                         webpage = urllib2.urlopen(request).read()
2542                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2543                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2544                         return
2545
2546                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2547                 if m is None:
2548                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2549                         return
2550                 internal_video_id = m.group('internalvideoid')
2551
2552                 info = {
2553                         'id': video_id,
2554                         'internal_id': internal_video_id,
2555                 }
2556
2557                 self.report_extraction(video_id)
2558                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2559                 try:
2560                         metaXml = urllib2.urlopen(xmlUrl).read()
2561                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2563                         return
2564
2565                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2566                 try:
2567                         videoNode = mdoc.findall('./video')[0]
2568                         info['description'] = videoNode.findall('./description')[0].text
2569                         info['title'] = videoNode.findall('./caption')[0].text
2570                         info['url'] = videoNode.findall('./file')[0].text
2571                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2572                         info['ext'] = info['url'].rpartition('.')[2]
2573                         info['format'] = info['ext']
2574                 except IndexError:
2575                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2576                         return
2577
2578                 return [info]
2579
2580
2581 class XVideosIE(InfoExtractor):
2582         """Information extractor for xvideos.com"""
2583
2584         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2585         IE_NAME = u'xvideos'
2586
2587         def report_webpage(self, video_id):
2588                 """Report information extraction."""
2589                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2590
2591         def report_extraction(self, video_id):
2592                 """Report information extraction."""
2593                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2594
2595         def _real_extract(self, url):
2596                 mobj = re.match(self._VALID_URL, url)
2597                 if mobj is None:
2598                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2599                         return
2600                 video_id = mobj.group(1).decode('utf-8')
2601
2602                 self.report_webpage(video_id)
2603
2604                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2605                 try:
2606                         webpage = urllib2.urlopen(request).read()
2607                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2608                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2609                         return
2610
2611                 self.report_extraction(video_id)
2612
2613
2614                 # Extract video URL
2615                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2616                 if mobj is None:
2617                         self._downloader.trouble(u'ERROR: unable to extract video url')
2618                         return
2619                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2620
2621
2622                 # Extract title
2623                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2624                 if mobj is None:
2625                         self._downloader.trouble(u'ERROR: unable to extract video title')
2626                         return
2627                 video_title = mobj.group(1).decode('utf-8')
2628
2629
2630                 # Extract video thumbnail
2631                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2632                 if mobj is None:
2633                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2634                         return
2635                 video_thumbnail = mobj.group(0).decode('utf-8')
2636
2637                 info = {
2638                         'id': video_id,
2639                         'url': video_url,
2640                         'uploader': None,
2641                         'upload_date': None,
2642                         'title': video_title,
2643                         'ext': 'flv',
2644                         'format': 'flv',
2645                         'thumbnail': video_thumbnail,
2646                         'description': None,
2647                         'player_url': None,
2648                 }
2649
2650                 return [info]
2651
2652
2653 class SoundcloudIE(InfoExtractor):
2654         """Information extractor for soundcloud.com
2655            To access the media, the uid of the song and a stream token
2656            must be extracted from the page source and the script must make
2657            a request to media.soundcloud.com/crossdomain.xml. Then
2658            the media can be grabbed by requesting from an url composed
2659            of the stream token and uid
2660          """
2661
2662         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2663         IE_NAME = u'soundcloud'
2664
2665         def __init__(self, downloader=None):
2666                 InfoExtractor.__init__(self, downloader)
2667
2668         def report_webpage(self, video_id):
2669                 """Report information extraction."""
2670                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2671
2672         def report_extraction(self, video_id):
2673                 """Report information extraction."""
2674                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2675
2676         def _real_extract(self, url):
2677                 mobj = re.match(self._VALID_URL, url)
2678                 if mobj is None:
2679                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2680                         return
2681
2682                 # extract uploader (which is in the url)
2683                 uploader = mobj.group(1).decode('utf-8')
2684                 # extract simple title (uploader + slug of song title)
2685                 slug_title =  mobj.group(2).decode('utf-8')
2686                 simple_title = uploader + u'-' + slug_title
2687
2688                 self.report_webpage('%s/%s' % (uploader, slug_title))
2689
2690                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2691                 try:
2692                         webpage = urllib2.urlopen(request).read()
2693                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2695                         return
2696
2697                 self.report_extraction('%s/%s' % (uploader, slug_title))
2698
2699                 # extract uid and stream token that soundcloud hands out for access
2700                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2701                 if mobj:
2702                         video_id = mobj.group(1)
2703                         stream_token = mobj.group(2)
2704
2705                 # extract unsimplified title
2706                 mobj = re.search('"title":"(.*?)",', webpage)
2707                 if mobj:
2708                         title = mobj.group(1).decode('utf-8')
2709                 else:
2710                         title = simple_title
2711
2712                 # construct media url (with uid/token)
2713                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2714                 mediaURL = mediaURL % (video_id, stream_token)
2715
2716                 # description
2717                 description = u'No description available'
2718                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2719                 if mobj:
2720                         description = mobj.group(1)
2721
2722                 # upload date
2723                 upload_date = None
2724                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2725                 if mobj:
2726                         try:
2727                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2728                         except Exception, e:
2729                                 self._downloader.to_stderr(compat_str(e))
2730
2731                 # for soundcloud, a request to a cross domain is required for cookies
2732                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2733
2734                 return [{
2735                         'id':           video_id.decode('utf-8'),
2736                         'url':          mediaURL,
2737                         'uploader':     uploader.decode('utf-8'),
2738                         'upload_date':  upload_date,
2739                         'title':        title,
2740                         'ext':          u'mp3',
2741                         'format':       u'NA',
2742                         'player_url':   None,
2743                         'description': description.decode('utf-8')
2744                 }]
2745
2746
2747 class InfoQIE(InfoExtractor):
2748         """Information extractor for infoq.com"""
2749
2750         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2751         IE_NAME = u'infoq'
2752
2753         def report_webpage(self, video_id):
2754                 """Report information extraction."""
2755                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2756
2757         def report_extraction(self, video_id):
2758                 """Report information extraction."""
2759                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2760
2761         def _real_extract(self, url):
2762                 mobj = re.match(self._VALID_URL, url)
2763                 if mobj is None:
2764                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2765                         return
2766
2767                 self.report_webpage(url)
2768
2769                 request = urllib2.Request(url)
2770                 try:
2771                         webpage = urllib2.urlopen(request).read()
2772                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2773                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2774                         return
2775
2776                 self.report_extraction(url)
2777
2778
2779                 # Extract video URL
2780                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2781                 if mobj is None:
2782                         self._downloader.trouble(u'ERROR: unable to extract video url')
2783                         return
2784                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2785
2786
2787                 # Extract title
2788                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2789                 if mobj is None:
2790                         self._downloader.trouble(u'ERROR: unable to extract video title')
2791                         return
2792                 video_title = mobj.group(1).decode('utf-8')
2793
2794                 # Extract description
2795                 video_description = u'No description available.'
2796                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2797                 if mobj is not None:
2798                         video_description = mobj.group(1).decode('utf-8')
2799
2800                 video_filename = video_url.split('/')[-1]
2801                 video_id, extension = video_filename.split('.')
2802
2803                 info = {
2804                         'id': video_id,
2805                         'url': video_url,
2806                         'uploader': None,
2807                         'upload_date': None,
2808                         'title': video_title,
2809                         'ext': extension,
2810                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2811                         'thumbnail': None,
2812                         'description': video_description,
2813                         'player_url': None,
2814                 }
2815
2816                 return [info]
2817
2818 class MixcloudIE(InfoExtractor):
2819         """Information extractor for www.mixcloud.com"""
2820         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2821         IE_NAME = u'mixcloud'
2822
2823         def __init__(self, downloader=None):
2824                 InfoExtractor.__init__(self, downloader)
2825
2826         def report_download_json(self, file_id):
2827                 """Report JSON download."""
2828                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2829
2830         def report_extraction(self, file_id):
2831                 """Report information extraction."""
2832                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2833
2834         def get_urls(self, jsonData, fmt, bitrate='best'):
2835                 """Get urls from 'audio_formats' section in json"""
2836                 file_url = None
2837                 try:
2838                         bitrate_list = jsonData[fmt]
2839                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2840                                 bitrate = max(bitrate_list) # select highest
2841
2842                         url_list = jsonData[fmt][bitrate]
2843                 except TypeError: # we have no bitrate info.
2844                         url_list = jsonData[fmt]
2845                 return url_list
2846
2847         def check_urls(self, url_list):
2848                 """Returns 1st active url from list"""
2849                 for url in url_list:
2850                         try:
2851                                 urllib2.urlopen(url)
2852                                 return url
2853                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2854                                 url = None
2855
2856                 return None
2857
2858         def _print_formats(self, formats):
2859                 print('Available formats:')
2860                 for fmt in formats.keys():
2861                         for b in formats[fmt]:
2862                                 try:
2863                                         ext = formats[fmt][b][0]
2864                                         print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2865                                 except TypeError: # we have no bitrate info
2866                                         ext = formats[fmt][0]
2867                                         print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2868                                         break
2869
2870         def _real_extract(self, url):
2871                 mobj = re.match(self._VALID_URL, url)
2872                 if mobj is None:
2873                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2874                         return
2875                 # extract uploader & filename from url
2876                 uploader = mobj.group(1).decode('utf-8')
2877                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2878
2879                 # construct API request
2880                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2881                 # retrieve .json file with links to files
2882                 request = urllib2.Request(file_url)
2883                 try:
2884                         self.report_download_json(file_url)
2885                         jsonData = urllib2.urlopen(request).read()
2886                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2887                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2888                         return
2889
2890                 # parse JSON
2891                 json_data = json.loads(jsonData)
2892                 player_url = json_data['player_swf_url']
2893                 formats = dict(json_data['audio_formats'])
2894
2895                 req_format = self._downloader.params.get('format', None)
2896                 bitrate = None
2897
2898                 if self._downloader.params.get('listformats', None):
2899                         self._print_formats(formats)
2900                         return
2901
2902                 if req_format is None or req_format == 'best':
2903                         for format_param in formats.keys():
2904                                 url_list = self.get_urls(formats, format_param)
2905                                 # check urls
2906                                 file_url = self.check_urls(url_list)
2907                                 if file_url is not None:
2908                                         break # got it!
2909                 else:
2910                         if req_format not in formats.keys():
2911                                 self._downloader.trouble(u'ERROR: format is not available')
2912                                 return
2913
2914                         url_list = self.get_urls(formats, req_format)
2915                         file_url = self.check_urls(url_list)
2916                         format_param = req_format
2917
2918                 return [{
2919                         'id': file_id.decode('utf-8'),
2920                         'url': file_url.decode('utf-8'),
2921                         'uploader':     uploader.decode('utf-8'),
2922                         'upload_date': u'NA',
2923                         'title': json_data['name'],
2924                         'ext': file_url.split('.')[-1].decode('utf-8'),
2925                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2926                         'thumbnail': json_data['thumbnail_url'],
2927                         'description': json_data['description'],
2928                         'player_url': player_url.decode('utf-8'),
2929                 }]
2930
class StanfordOpenClassroomIE(InfoExtractor):
        """Information extractor for Stanford's Open ClassRoom.

        _VALID_URL accepts three kinds of URL:

        * a video page (both ``course`` and ``video`` query parameters),
          which yields a single info dictionary;
        * a course page (only ``course``), which is expanded by extracting
          every VideoPage.php link found on it;
        * the site root / home page, which is expanded by extracting every
          CoursePage.php link found on it.
        """

        _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
        IE_NAME = u'stanfordoc'

        # Common prefix of every page and media URL built by this extractor.
        _BASE_URL = 'http://openclassroom.stanford.edu/MainFolder/'

        def report_download_webpage(self, objid):
                """Report webpage download."""
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

        def _extract_video(self, course, video):
                """Return a one-element list with the info dict of a single video.

                Downloads the per-video metadata XML and reads the title and the
                media file name from it.  Returns None after reporting the
                problem when the XML cannot be downloaded or lacks those fields.
                """
                info = {
                        'id': course + '_' + video,
                        # 'uploader' is a mandatory field of the info dictionary;
                        # nothing on this code path provides one.
                        'uploader': None,
                }

                self.report_extraction(info['id'])
                baseUrl = self._BASE_URL + 'courses/' + course + '/videos/'
                xmlUrl = baseUrl + video + '.xml'
                try:
                        metaXml = urllib2.urlopen(xmlUrl).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                        return
                mdoc = xml.etree.ElementTree.fromstring(metaXml)
                try:
                        info['title'] = mdoc.findall('./title')[0].text
                        info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                except (IndexError, TypeError):
                        # IndexError: element missing; TypeError: <videoFile/> is
                        # present but empty, so .text is None.
                        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                        return
                info['ext'] = info['url'].rpartition('.')[2]
                info['format'] = info['ext']
                return [info]

        def _extract_references(self, page, link_re):
                """Extract every page that `page` links to and merge the results.

                link_re must contain one group capturing the link target relative
                to _BASE_URL.  Duplicate links are visited once, in page order.
                """
                results = []
                for href in orderedSet(re.findall(link_re, page)):
                        # NOTE(review): extract() is defined outside this class;
                        # presumably it hands back the _real_extract() result, which
                        # is None on the error paths here - treat that as "no entries"
                        # instead of failing on list += None.
                        results.extend(self.extract(self._BASE_URL + unescapeHTML(href)) or [])
                return results

        def _real_extract(self, url):
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                course = mobj.group('course')
                video = mobj.group('video')

                if course and video: # A specific video
                        return self._extract_video(course, video)

                if course: # A course page
                        self.report_download_webpage(course)
                        # _VALID_URL makes the scheme optional, urlopen() does not.
                        if re.match(r'https?://', url) is None:
                                url = 'http://' + url
                        page_url = url
                        link_re = r'<a href="(VideoPage\.php\?[^"]+)">'
                else: # Root page
                        self.report_download_webpage('Stanford OpenClassroom')
                        page_url = self._BASE_URL + 'HomePage.php'
                        link_re = r'<a href="(CoursePage\.php\?[^"]+)">'

                try:
                        page = urllib2.urlopen(page_url).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                        return

                return self._extract_references(page, link_re)
3042
class MTVIE(InfoExtractor):
        """Information extractor for MTV.com

        The video page supplies the song name, performer, mtvn_uri and
        playlist id; those are used to query the mediaGen service, whose XML
        answer lists the available renditions.
        """

        _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
        IE_NAME = u'mtv'

        def report_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

        def _real_extract(self, url):
                """Return a one-element list with the info dict for `url`.

                Returns None after reporting the problem through the downloader
                when the URL is invalid, a download fails or an expected field is
                missing.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                if not mobj.group('proto'):
                        # The scheme is optional in _VALID_URL but required by urllib2.
                        url = 'http://' + url
                video_id = mobj.group('videoid')
                self.report_webpage(video_id)

                request = urllib2.Request(url)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
                        return

                mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract song name')
                        return
                song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
                mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract performer')
                        return
                performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
                video_title = performer + ' - ' + song_name

                mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
                        return
                mtvn_uri = mobj.group(1)

                mobj = re.search(r'MTVN\.Player\.defaultPlaylistId = ([0-9]+);', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract content id')
                        return
                content_id = mobj.group(1)

                videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
                self.report_extraction(video_id)
                request = urllib2.Request(videogen_url)
                try:
                        metadataXml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
                        return

                mdoc = xml.etree.ElementTree.fromstring(metadataXml)
                renditions = mdoc.findall('.//rendition')
                if not renditions:
                        self._downloader.trouble(u'ERROR: unable to find any rendition in video metadata')
                        return

                # For now, always pick the highest quality.
                # NOTE(review): this relies on the service listing renditions in
                # ascending quality order - confirm.
                rendition = renditions[-1]

                try:
                        _, _, ext = rendition.attrib['type'].partition('/')
                        video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
                        video_url = rendition.find('./src').text
                except (KeyError, AttributeError):
                        # KeyError: attribute missing; AttributeError: no <src> child,
                        # so find() returned None.
                        self._downloader.trouble(u'ERROR: Invalid rendition field.')
                        return

                info = {
                        'id': video_id,
                        'url': video_url,
                        'uploader': performer,
                        'title': video_title,
                        'ext': ext,
                        'format': video_format,
                }

                return [info]
3131
3132
class YoukuIE(InfoExtractor):
        """Information extractor for v.youku.com

        A Youku video is delivered as a sequence of segments; _real_extract
        returns one info dictionary per segment.
        """

        _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
        IE_NAME = u'Youku'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_download_webpage(self, file_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

        def report_extraction(self, file_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

        def _gen_sid(self):
                """Return a session id: the time in milliseconds followed by two random numbers."""
                nowTime = int(time.time() * 1000)
                random1 = random.randint(1000,1998)
                random2 = random.randint(1000,9999)

                return "%d%d%d" %(nowTime,random1,random2)

        def _get_file_ID_mix_string(self, seed):
                """Return the alphabet shuffled according to `seed`, as a list of characters.

                Each step advances a linear congruential generator and moves the
                character it selects from the remaining alphabet to the result.
                """
                mixed = []
                source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
                seed = float(seed)
                while source:
                        seed = (seed * 211 + 30031) % 65536
                        index = int(math.floor(seed / 65536 * len(source)))
                        mixed.append(source.pop(index))
                return mixed

        def _get_file_id(self, fileId, seed):
                """Decode a '*'-separated list of indices into the real file id.

                Every non-empty item of `fileId` is an index into the alphabet
                shuffled with `seed`.
                """
                mixed = self._get_file_ID_mix_string(seed)
                return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

        def _real_extract(self, url):
                """Return a list with one info dict per video segment.

                Returns None after reporting the problem through the downloader
                when the URL is invalid, the playlist cannot be downloaded or it
                does not have the expected structure.
                """
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group('ID')

                info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

                request = urllib2.Request(info_url, None, std_headers)
                try:
                        self.report_download_webpage(video_id)
                        jsondata = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return

                self.report_extraction(video_id)
                try:
                        config = json.loads(jsondata)
                        video_data = config['data'][0]

                        video_title = video_data['title']
                        seed = video_data['seed']

                        video_format = self._downloader.params.get('format', None)
                        supported_format = video_data['streamfileids'].keys()

                        if video_format is None or video_format == 'best':
                                if 'hd2' in supported_format:
                                        video_format = 'hd2'
                                else:
                                        video_format = 'flv'
                                ext = u'flv'
                        elif video_format == 'worst':
                                video_format = 'mp4'
                                ext = u'mp4'
                        else:
                                video_format = 'flv'
                                ext = u'flv'

                        keys = [seg['k'] for seg in video_data['segs'][video_format]]
                        fileid = self._get_file_id(video_data['streamfileids'][video_format], seed)

                        #TODO check error
                        #youku only could be viewed from mainland china
                except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                        # Malformed JSON or a playlist without the expected layout.
                        # Deliberately not a bare except, so that KeyboardInterrupt
                        # and SystemExit still propagate.
                        self._downloader.trouble(u'ERROR: unable to extract info section')
                        return

                files_info=[]
                sid = self._gen_sid()

                # Characters 8 and 9 (0-based) of fileid hold the segment number
                # as two hex digits, so fileid[8:10] is replaced for each segment.
                for index, key in enumerate(keys):

                        temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
                        download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                        info = {
                                'id': '%s_part%02d' % (video_id, index),
                                'url': download_url,
                                'uploader': None,
                                'title': video_title,
                                'ext': ext,
                                'format': u'NA'
                        }
                        files_info.append(info)

                return files_info
3253
3254
class XNXXIE(InfoExtractor):
        """Information extractor for xnxx.com"""

        _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
        IE_NAME = u'xnxx'
        VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
        VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
        VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

        def report_webpage(self, video_id):
                """Tell the user that the video webpage is being downloaded."""
                message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
                self._downloader.to_screen(message)

        def report_extraction(self, video_id):
                """Tell the user that information is being extracted."""
                message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
                self._downloader.to_screen(message)

        def _real_extract(self, url):
                """Return a one-element list with the info dict for `url`.

                The media URL, title and thumbnail are read from the video
                webpage with the class-level regular expressions.  Returns None
                after reporting the problem when any step fails.
                """
                url_match = re.match(self._VALID_URL, url)
                if url_match is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = url_match.group(1).decode('utf-8')

                self.report_webpage(video_id)

                # Fetch the page all three fields are scraped from
                try:
                        page = urllib2.urlopen(url).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
                        return

                url_found = re.search(self.VIDEO_URL_RE, page)
                if url_found is None:
                        self._downloader.trouble(u'ERROR: unable to extract video url')
                        return
                raw_video_url = url_found.group(1).decode('utf-8')

                title_found = re.search(self.VIDEO_TITLE_RE, page)
                if title_found is None:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return

                thumb_found = re.search(self.VIDEO_THUMB_RE, page)
                if thumb_found is None:
                        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                        return

                return [{
                        'id': video_id,
                        'url': urllib.unquote(raw_video_url),
                        'uploader': None,
                        'upload_date': None,
                        'title': title_found.group(1).decode('utf-8'),
                        'ext': 'flv',
                        'format': 'flv',
                        'thumbnail': thumb_found.group(1).decode('utf-8'),
                        'description': None,
                        'player_url': None,
                }]
3318
3319
class GooglePlusIE(InfoExtractor):
        """Information extractor for plus.google.com."""

        _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
        IE_NAME = u'plus.google'

        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)

        def report_extract_entry(self, url):
                """Report downloading entry"""
                self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

        def report_date(self, upload_date):
                """Report entry date"""
                self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

        def report_uploader(self, uploader):
                """Report entry uploader"""
                self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

        def report_title(self, video_title):
                """Report entry title"""
                self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

        def report_extract_vid_page(self, video_page):
                """Report information extraction."""
                self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

        def _real_extract(self, url):
                """Return a one-element list with the info dict for a post URL.

                The post page supplies the date, uploader and title and links to
                a video page, which in turn lists the media URLs per resolution.
                Returns None after reporting the problem when a page cannot be
                downloaded or the video page / media links cannot be found.
                """
                # Extract id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                        return

                post_url = mobj.group(0)
                video_id = mobj.group(2)

                video_extension = 'flv'

                # Step 1, Retrieve post webpage to extract further information
                self.report_extract_entry(post_url)
                request = urllib2.Request(post_url)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
                        return

                # Extract upload date
                upload_date = u'NA'
                mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
                if mobj:
                        try:
                                # Convert timestring to a format suitable for filename
                                timestamp = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d")
                                upload_date = timestamp.strftime('%Y%m%d')
                        except ValueError:
                                # Unexpected timestamp text: the date is optional, keep u'NA'.
                                pass
                self.report_date(upload_date)

                # Extract uploader
                uploader = u'NA'
                mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
                if mobj:
                        uploader = mobj.group(1)
                self.report_uploader(uploader)

                # Extract title
                # Get the first line for title
                video_title = u'NA'
                mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
                if mobj:
                        video_title = mobj.group(1)
                self.report_title(video_title)

                # Step 2, Simulate clicking the image box to launch video
                mobj = re.search(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video page URL')
                        # trouble() may return instead of raising; without this
                        # return the next line would fail on None.
                        return

                video_page = mobj.group(1)
                request = urllib2.Request(video_page)
                try:
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                        return
                self.report_extract_vid_page(video_page)

                # Extract video links of all sizes on the video page; each match
                # is a (resolution, url) pair of strings.
                links = re.findall(r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
                if not links:
                        self._downloader.trouble(u'ERROR: unable to extract video links')
                        return

                # Pick the highest resolution.  The resolutions are strings, so
                # they must be compared as integers ('1080' < '720' as text).
                video_url = max(links, key=lambda link: int(link[0]))[1]
                # Treat escaped \u0026 style hex
                video_url = unicode(video_url, "unicode_escape")

                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url,
                        'uploader':     uploader.decode('utf-8'),
                        'upload_date':  upload_date.decode('utf-8'),
                        'title':        video_title.decode('utf-8'),
                        'ext':          video_extension.decode('utf-8'),
                        'format':       u'NA',
                        'player_url':   None,
                }]