2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:    Nickname of the video uploader.
    ext:         Video filename extension.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is supplied by each concrete subclass.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header is elided in this
    # excerpt; the two lines below are its body.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...` opener line is elided in this
    # excerpt; the lines below are the verbose-mode pattern body, matched
    # with re.VERBOSE in suitable() and _real_extract().
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/) # the various hostnames, with wildcard subdomains
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original destination from a next_url redirection parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map (other entries elided in this excerpt).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-size map (entries elided in this excerpt).
    _video_dimensions = {
    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to pass re.VERBOSE, since this class'
        # _VALID_URL is written in verbose (commented) regex form.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (SRT) text.

        Each <text start="..." dur="...">caption</text> element becomes a
        numbered SRT cue with HH:MM:SS,mmm start/end timestamps.
        NOTE(review): the `srt = ''` accumulator initialization and the
        `start = float(start)` conversion are elided in this excerpt; `srt`
        is built up below and presumably returned at the end.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default cue length when dur="" is absent
            end = start + float(dur)
            # Render seconds-as-float into SRT "HH:MM:SS,mmm" form.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions (py2 print)."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is elided in
        # this excerpt; the line below is its body.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
    def _real_initialize(self):
        """Set language to English, optionally log in (options or .netrc),
        and confirm age, emitting WARNINGs on recoverable failures."""
        if self._downloader is None:
            # (early return elided in this excerpt)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (try: header elided in this excerpt)
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # (credentials unpacking / else branch elided in this excerpt)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:  # py2 except syntax
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language
        request = urllib2.Request(self._LANG_URL)
        # (self.report_lang() / try: elided in this excerpt)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Non-fatal: continue without the language cookie.
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # (username is None check / login_form dict opener elided in this excerpt)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # (self.report_login() / try: elided in this excerpt)
            login_results = urllib2.urlopen(request).read()
            # The login form re-appearing in the response means auth failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
        # (age_form dict opener / next_url field elided in this excerpt)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # (try: header elided in this excerpt)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Fatal for age-restricted videos, hence ERROR rather than WARNING.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Download the watch page and get_video_info, extract metadata,
        optionally fetch subtitles, select formats, and build one result
        dict per selected format for the downloader."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # (mobj None-check elided in this excerpt)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 is the ID capture of _VALID_URL (group 1 is the prefix group).
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # (try: header elided in this excerpt)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # (mobj None-check elided in this excerpt)
            # Strip JS backslash-escaping (\/ -> /) from the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # (try: header elided in this excerpt)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    # (break elided in this excerpt)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # (else: elided in this excerpt)
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # (fallback thumbnail assignment elided in this excerpt)
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape from the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # (mobj None-check elided in this excerpt)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # (try/except around strptime elided in this excerpt)
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # (outer try: header elided in this excerpt)
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                # (inner try: header elided in this excerpt)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit option > English > first available.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # ('en' branch body and else: elided in this excerpt)
                    srt_lang = srt_lang_list.keys()[0]  # py2: keys() returns a list
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                # (try: header elided in this excerpt)
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # (empty-response check elided in this excerpt)
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitle failures are warnings; extraction continues.
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams expose a single connection URL, no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the limit; lists are ordered best-first.
                format_list = available_formats[available_formats.index(format_limit):]
            # (else: elided in this excerpt)
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # (return elided in this excerpt)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # (else: elided in this excerpt)
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # (membership check / break elided in this excerpt)
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # (else: elided in this excerpt)
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # NOTE(review): the results-list initialization is elided in this excerpt.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # (results.append({ opener elided in this excerpt)
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                # RTMP entries carry format_param=None, rendered as u'NA'.
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Groups: (1) video id, (2) simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form so
        age-restricted videos are reachable for this session."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # (try: header elided in this excerpt)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age
        # (disclaimer_form dict opener / filters field elided in this excerpt)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # (try: header elided in this excerpt)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe watch
        page; YouTube-hosted entries are delegated to the downloader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the embedded YouTube ID to the downloader (YoutubeIE handles it).
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # (return elided in this excerpt)

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # (mobj not-None branch elided in this excerpt)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # (mobj None-check / plain-URL branch elided in this excerpt)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # (else: branch header elided in this excerpt — flashvars fallback)
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo JSON escaping of slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Groups: (1) video id (before the underscore), (2) title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a Dailymotion
        video page (family filter disabled via cookie)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated pages are served.
        request.add_header('Cookie', 'family_filter=off')
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL
        # NOTE(review): the `video_url = mediaURL` assignment is elided in
        # this excerpt; video_url is used in the result dict below.

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Group 1 is the (possibly negative) numeric docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract media URL (mp4 download link, falling back to flv stream),
        title, description and optional thumbnail from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # (mobj is None branch header elided — fall back to flv stream URL)
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Decode JS hex escapes: \x3d -> '=' and \x26 -> '&'.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')
        # (else branch / video_url assignment elided in this excerpt)

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # (try: header elided in this excerpt)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # (mobj is None check elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            # (placeholder thumbnail assignment elided in this excerpt)

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename from the `current=` query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a Photobucket
        video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # (try: header elided in this excerpt)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment is elided in
        # this excerpt; video_url is used in the result dict below.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # (mobj is None check elided in this excerpt)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (return [{ opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
874 def _real_extract(self, url, new_video=True):
875 # Extract ID from URL
876 mobj = re.match(self._VALID_URL, url)
878 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
881 video_id = mobj.group(2)
882 video_extension = 'flv'
884 # Rewrite valid but non-extractable URLs as
885 # extractable English language /watch/ URLs
886 if re.match(self._VPAGE_URL, url) is None:
887 request = urllib2.Request(url)
889 webpage = urllib2.urlopen(request).read()
890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
894 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
896 self._downloader.trouble(u'ERROR: Unable to extract id field')
898 yahoo_id = mobj.group(1)
900 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
902 self._downloader.trouble(u'ERROR: Unable to extract vid field')
904 yahoo_vid = mobj.group(1)
906 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
907 return self._real_extract(url, new_video=False)
909 # Retrieve video webpage to extract further information
910 request = urllib2.Request(url)
912 self.report_download_webpage(video_id)
913 webpage = urllib2.urlopen(request).read()
914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
918 # Extract uploader and title from webpage
919 self.report_extraction(video_id)
920 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
922 self._downloader.trouble(u'ERROR: unable to extract video title')
924 video_title = mobj.group(1).decode('utf-8')
926 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
928 self._downloader.trouble(u'ERROR: unable to extract video uploader')
930 video_uploader = mobj.group(1).decode('utf-8')
932 # Extract video thumbnail
933 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
935 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
937 video_thumbnail = mobj.group(1).decode('utf-8')
939 # Extract video description
940 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
942 self._downloader.trouble(u'ERROR: unable to extract video description')
944 video_description = mobj.group(1).decode('utf-8')
945 if not video_description:
946 video_description = 'No description available.'
948 # Extract video height and width
949 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
951 self._downloader.trouble(u'ERROR: unable to extract video height')
953 yv_video_height = mobj.group(1)
955 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
957 self._downloader.trouble(u'ERROR: unable to extract video width')
959 yv_video_width = mobj.group(1)
961 # Retrieve video playlist to extract media URL
962 # I'm not completely sure what all these options are, but we
963 # seem to need most of them, otherwise the server sends a 401.
964 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
965 yv_bitrate = '700' # according to Wikipedia this is hard-coded
966 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
967 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
968 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
970 self.report_download_webpage(video_id)
971 webpage = urllib2.urlopen(request).read()
972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
973 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
976 # Extract media URL from playlist XML
977 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
979 self._downloader.trouble(u'ERROR: Unable to extract media URL')
981 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
982 video_url = unescapeHTML(video_url)
985 'id': video_id.decode('utf-8'),
987 'uploader': video_uploader,
988 'upload_date': u'NA',
989 'title': video_title,
990 'ext': video_extension.decode('utf-8'),
991 'thumbnail': video_thumbnail.decode('utf-8'),
992 'description': video_description,
993 'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): this excerpt elides scaffolding lines (`try:`,
    # `if mobj is None:`, `return`); indentation below is reconstructed.
    # Confirm control flow against the full file.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video URL, title, uploader and metadata for a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # Reported when the URL fails to match _VALID_URL (guard elided in excerpt).
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's JavaScript.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # Reported when the config JSON is absent or unparsable.
        self._downloader.trouble(u'ERROR: unable to extract info section')

        video_title = config["video"]["title"]
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the page HTML (not the config JSON).
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; defaults to u'NA').
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Codec preference order: first match in config["video"]["files"] wins.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # Reported when no codec in the preference list is available.
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the signed play_redirect URL that resolves to the media file.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # Result dictionary (opening `return [{` elided in excerpt).
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): excerpt elides scaffolding (`try:`, `if mobj is None:`,
    # `return`, docstring quotes on nested classes); indentation reconstructed.

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: no site-specific extractor matched this URL.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(urllib2.Request):
            # Request only headers, not the body (return value elided in excerpt).
            def get_method(self):

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                        if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                        origin_req_host=req.get_origin_req_host(),
                # Non-redirect codes are surfaced as HTTP errors.
                raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                    if k.lower() not in ("content-length", "content-type"))
                # Retry the same URL with a plain (GET) request.
                return self.parent.open(urllib2.Request(req.get_full_url(),
                    origin_req_host=req.get_origin_req_host(),

        # Build a bare opener with exactly the handlers we need.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                HTTPMethodFallback, HEADRedirectHandler,
                urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects: not a shortener.
        if url == new_url: return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain on the resolved URL.
        self._downloader.download([new_url])

    def _real_extract(self, url):
        """Heuristically extract a direct media URL from an arbitrary page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Reported when neither pattern matched (guard elided in excerpt).
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Result dictionary (opening `return [{` elided in excerpt).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # NOTE(review): excerpt elides scaffolding (`try:`, `if` guards, loop
    # initializers such as pagenum/video_ids); indentation reconstructed.

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API returns at most 50 results per request (jsonc format).
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearchN:terms' query and dispatch N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split 'ytsearchN' prefix from the search terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: default to a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # Numeric prefix path (int parse elided in excerpt).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API in batches of 50 until `limit` is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exists.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, initializers);
    # indentation reconstructed.

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Matches result links; group(1) is the docid.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" pager element.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearchN:terms' query and dispatch N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: default to a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # Numeric prefix path (int parse elided in excerpt).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        # `start` parameter steps 10 results per page.
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No more pages: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, initializers);
    # indentation reconstructed.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Matches result links; group(1) is 'owner_id/video_id'.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearchN:terms' query and dispatch N result downloads."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: default to a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # Numeric prefix path (int parse elided in excerpt).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedupe across pages; Yahoo result pages can repeat entries.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No more pages: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `break`,
    # initializers); indentation reconstructed.

    # group(1): playlist type char (p/a/list); group(2): playlist id;
    # group(3): optional trailing video id.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
    # Pager element present while more pages remain.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids in a playlist and queue each for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # A trailing video id means the URL points at a single video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # Default/else branch (guard elided in excerpt).
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging when the pager element disappears (`break` elided).
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `break`,
    # loop initializers); indentation reconstructed.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData uploads feed caps results per request at 50.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all of a user's uploads via the GData API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # GData start-index is 1-based.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `break`,
    # loop initializers, _PAGE_SIZE definition); indentation reconstructed.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all of a blip.tv user's video paths and queue them."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Mobile episode-list endpoint; %s is filled with the numeric user id.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = urllib2.Request(url)

        # Fetch the profile page once to resolve username -> numeric user id.
        page = urllib2.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)

        request = urllib2.Request( page_base + "&page=" + str(pagenum) )

        page = urllib2.urlopen(request).read().decode('utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `return`);
    # indentation reconstructed.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles page to its direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Result dictionary (opening `return [{` elided in excerpt).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): excerpt elides scaffolding (`try:`, guards, `return`,
    # dict bodies such as _video_extensions and login_form); indentation
    # reconstructed.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used both for scraping and format selection.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Field name -> regex that captures its value from inline JS/HTML.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-format media URLs, scraped from '("<fmt>_src", "...")' segments.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Non-fatal: continue without credentials.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials available: skip login (body elided in excerpt).
        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))

        login_results = urllib2.urlopen(request).read()
        # The login form reappearing in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URLs and metadata for one Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)

        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        # upload date: RFC-2822 date string -> YYYYMMDD
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description (optional; has a fallback default)
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific-format branch (else elided in excerpt).
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension should be 'mp4' if there is no format parameter in the map.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Result dictionary entry (opening elided in excerpt).
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the server answered with the media itself (direct download)."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video metadata from a blip.tv URL.

        Asks the site for JSON metadata; if the response Content-Type is
        video/*, treats the URL as a direct download instead.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Request machine-readable metadata instead of the HTML page.
        # NOTE(review): cchar is presumably '?' or '&' chosen from whether
        # the URL already carries a query string — confirm against full file.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # Server returned the media file itself: derive title/ext from the URL.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']

            # 'datestamp' is e.g. MM-DD-YY HH:MM(AM/PM); normalized to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # NOTE(review): blip.tv apparently serves some media only to this
        # user agent; override applies to all later requests — confirm.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
2098 class MyVideoIE(InfoExtractor):
2099 """Information Extractor for myvideo.de."""
2101 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2102 IE_NAME = u'myvideo'
2104 def __init__(self, downloader=None):
2105 InfoExtractor.__init__(self, downloader)
2107 def report_download_webpage(self, video_id):
2108 """Report webpage download."""
2109 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2111 def report_extraction(self, video_id):
2112 """Report information extraction."""
2113 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2115 def _real_extract(self,url):
2116 mobj = re.match(self._VALID_URL, url)
2118 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2121 video_id = mobj.group(1)
2124 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2126 self.report_download_webpage(video_id)
2127 webpage = urllib2.urlopen(request).read()
2128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2129 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2132 self.report_extraction(video_id)
2133 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2136 self._downloader.trouble(u'ERROR: unable to extract media URL')
2138 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2140 mobj = re.search('<title>([^<]+)</title>', webpage)
2142 self._downloader.trouble(u'ERROR: unable to extract title')
2145 video_title = mobj.group(1)
2151 'upload_date': u'NA',
2152 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts short forms (":tds", ":colbert", ...) as well as full
    # full-episodes URLs on either show's site.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report the download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report the download of the episode's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report the resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Extract every video act of an episode; returns a list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Short forms map to the show's "newest full episode" landing page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No episode in the URL means "download the newest one".
        dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # The landing page redirects to the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The Flash embed URL doubles as the key into the MRSS index.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # One <item> per act of the episode; each becomes one info dict.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            # Each <rendition> is one (bitrate, stream URL) pair.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video data by following the og:video player's config JSON."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
            webPage = urllib2.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type, else UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Page metadata: description, thumbnail and the Flash player URL,
        # whose 'config=' query parameter points at the JSON config.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
            configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        # Second playlist entry carries the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video data via the site's moogaloop metadata XML service."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds an internal id used by the metadata service.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
            self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

            'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
            metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # First <video> node carries description/caption/file/thumbnail.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['url'] = videoNode.findall('./file')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            # Extension (and 'format') is taken from the media URL's suffix.
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail from an xvideos watch page."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # Media URL is percent-encoded in the page's flv_url parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the <title> tag minus the site-name suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0).decode('utf-8')

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the stream URL and metadata for a soundcloud track."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + u'-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
            video_id = mobj.group(1)
            stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
            title = mobj.group(1).decode('utf-8')
            # Fall back to the slug-based title when the page gives none.
            title = simple_title

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
            description = mobj.group(1)

        # Upload date, e.g. "November 28, 2011 20:39", normalized to YYYYMMDD;
        # parse failures are only logged, not fatal.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
            upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:
            self._downloader.to_stderr(str(e))

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE stream URL and metadata from an infoq page."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The stream path is base64-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Title comes from an inline JavaScript assignment.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # id and extension are both derived from the stream's filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        With bitrate info the entry is a mapping bitrate -> url list;
        without it, the entry is the url list itself (hence the TypeError
        fallback below).
        """
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
                # Probe the URL; network failures just move to the next one.
                urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        # Human-readable listing for --list-formats: "fmt<TAB>bitrate<TAB>[ext]".
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        """Pick a working stream URL for the requested format and return its info."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
            self.report_download_json(file_url)
            jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): first format whose URL list yields a
        # live URL; otherwise honor the explicitly requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the root page, a course page, or a specific video page,
    # discriminated via the 'course' and 'video' named groups.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract one video, or recursively expand a course/root page
        into its videos via self.extract() on reference entries."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                # Media URL is relative to the course's videos/ directory.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        elif mobj.group('course'): # A course page
            course = mobj.group('course')

            self.report_download_webpage(info['id'])
                coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
                info['title'] = unescapeHTML(m.group(1))
                # Fall back to the id when the page has no <h1> title.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry, expanded below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                # Root page: enumerate every course and recurse into each.
                'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
2891 class MTVIE(InfoExtractor):
2892 """Information extractor for MTV.com"""
2894 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2897 def report_webpage(self, video_id):
2898 """Report information extraction."""
2899 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2901 def report_extraction(self, video_id):
2902 """Report information extraction."""
2903 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2905 def _real_extract(self, url):
2906 mobj = re.match(self._VALID_URL, url)
2908 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2910 if not mobj.group('proto'):
2911 url = 'http://' + url
2912 video_id = mobj.group('videoid')
2913 self.report_webpage(video_id)
2915 request = urllib2.Request(url)
2917 webpage = urllib2.urlopen(request).read()
2918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2919 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2922 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2924 self._downloader.trouble(u'ERROR: unable to extract song name')
2926 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2927 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2929 self._downloader.trouble(u'ERROR: unable to extract performer')
2931 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2932 video_title = performer + ' - ' + song_name
2934 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2936 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2938 mtvn_uri = mobj.group(1)
2940 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2942 self._downloader.trouble(u'ERROR: unable to extract content id')
2944 content_id = mobj.group(1)
2946 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2947 self.report_extraction(video_id)
2948 request = urllib2.Request(videogen_url)
2950 metadataXml = urllib2.urlopen(request).read()
2951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2952 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2955 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2956 renditions = mdoc.findall('.//rendition')
2958 # For now, always pick the highest quality.
2959 rendition = renditions[-1]
2962 _,_,ext = rendition.attrib['type'].partition('/')
2963 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2964 video_url = rendition.find('./src').text
2966 self._downloader.trouble('Invalid rendition field.')
2972 'uploader': performer,
2973 'title': video_title,