2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
# NOTE(review): this file is a numbered source listing — each line keeps its
# original line number — and the excerpt has gaps (e.g. the `def` line of
# initialize(), around original line 71, is elided). Code bytes below are kept
# as-is; only comments/docstrings are added.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information from the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title and simplified
32 title, author and others. The information is stored in a dictionary
33 which is then passed to the FileDownloader. The FileDownloader
34 processes this information possibly downloading the video to the file
35 system, among other possible outcomes. The dictionaries must include
40 uploader: Nickname of the video uploader.
42 ext: Video filename extension.
44 player_url: SWF Player URL (may be None).
46 The following fields are optional. Their primary purpose is to allow
47 youtube-dl to serve as the backend for a video search function, such
48 as the one in youtube2mp3. They are only used when their respective
49 forced printing functions are called:
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
54 Subclasses of this one should re-define the _real_initialize() and
55 _real_extract() methods and define a _VALID_URL regexp.
56 Probably, they should also be added to the list of extractors.
# Template-method pattern: the public entry points (initialize/extract) defer
# to the _real_* hooks that subclasses override.
62 def __init__(self, downloader=None):
63 """Constructor. Receives an optional downloader."""
65 self.set_downloader(downloader)
67 def suitable(self, url):
68 """Receives a URL and returns True if suitable for this IE."""
69 return re.match(self._VALID_URL, url) is not None
# (the `def` line of initialize() is elided from this excerpt; the docstring
# and delegation to _real_initialize() below belong to it)
72 """Initializes an instance (authentication, etc)."""
74 self._real_initialize()
77 def extract(self, url):
78 """Extracts URL information and returns it in list of dicts."""
80 return self._real_extract(url)
82 def set_downloader(self, downloader):
83 """Sets the downloader for this IE."""
84 self._downloader = downloader
# Default hooks are no-ops; subclasses override them.
86 def _real_initialize(self):
87 """Real initialization process. Redefine in subclasses."""
90 def _real_extract(self, url):
91 """Real extraction process. Redefine in subclasses."""
95 class YoutubeIE(InfoExtractor):
96 """Information extractor for youtube.com."""
# _VALID_URL: group 1 is the URL prefix (scheme/host/path variants), group 2
# is the video id; the conditional `(?(1).+)?` requires a tail only when a
# prefix matched. _NEXT_URL_RE pulls the real target out of redirect-style
# URLs (age verification etc.).
98 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE = 'youtube'
104 # Listed in order of quality
105 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> filename extension map (entries elided in this excerpt).
107 _video_extensions = {
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
119 _video_dimensions = {
# --- progress-reporting helpers: each just prints a status line via the
# --- downloader; no return value.
137 def report_lang(self):
138 """Report attempt to set language."""
139 self._downloader.to_screen(u'[youtube] Setting language')
141 def report_login(self):
142 """Report attempt to log in."""
143 self._downloader.to_screen(u'[youtube] Logging in')
145 def report_age_confirmation(self):
146 """Report attempt to confirm age."""
147 self._downloader.to_screen(u'[youtube] Confirming age')
149 def report_video_webpage_download(self, video_id):
150 """Report attempt to download video webpage."""
151 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
153 def report_video_info_webpage_download(self, video_id):
154 """Report attempt to download video info webpage."""
155 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
157 def report_video_subtitles_download(self, video_id):
158 """Report attempt to download video subtitles."""
159 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
161 def report_information_extraction(self, video_id):
162 """Report attempt to extract video information."""
163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
165 def report_unavailable_format(self, video_id, format):
166 """Report that the requested format is not available."""
167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
169 def report_rtmp_download(self):
170 """Indicate the download will use the RTMP protocol."""
171 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML into SubRip (.srt) text by regex,
# not by XML parsing (see TODO below). The srt accumulator's initialization
# and the final return are on lines elided from this excerpt.
173 def _closed_captions_xml_to_srt(self, xml_string):
175 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176 # TODO parse xml instead of regex
177 for n, (start, dur_tag, dur, caption) in enumerate(texts):
178 if not dur: dur = '4'
# NOTE(review): an elided line (original 179) presumably converts `start`
# from str to float before the arithmetic below — confirm against full file.
180 end = start + float(dur)
181 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
183 caption = unescapeHTML(caption)
184 caption = unescapeHTML(caption) # double cycle, intentional
185 srt += str(n+1) + '\n'
186 srt += start + ' --> ' + end + '\n'
187 srt += caption + '\n\n'
# Print the itag / extension / dimensions table for --list-formats.
190 def _print_formats(self, formats):
191 print 'Available formats:'
193 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Initialization: resolve credentials (CLI options, then .netrc), force the
# English-language cookie, log in, and confirm age. Network failures are
# reported as warnings (language/login) or errors (age) — not raised.
195 def _real_initialize(self):
196 if self._downloader is None:
201 downloader_params = self._downloader.params
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params.get('username', None) is not None:
205 username = downloader_params['username']
206 password = downloader_params['password']
207 elif downloader_params.get('usenetrc', False):
209 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
214 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
215 except (IOError, netrc.NetrcParseError), err:
216 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language to English so later page-scraping regexes match.
220 request = urllib2.Request(self._LANG_URL)
223 urllib2.urlopen(request).read()
224 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
225 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
228 # No authentication to be performed
234 'current_form': 'loginForm',
236 'action_login': 'Log In',
237 'username': username,
238 'password': password,
240 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
243 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
244 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
245 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
248 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
254 'action_confirm': 'Confirm',
256 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
258 self.report_age_confirmation()
259 age_results = urllib2.urlopen(request).read()
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: resolve redirect URLs, fetch the watch page and
# get_video_info, pull metadata/subtitles, pick formats, and hand one info
# dict per chosen format to the downloader.
264 def _real_extract(self, url):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj = re.search(self._NEXT_URL_RE, url)
268 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
270 # Extract video id from URL
271 mobj = re.match(self._VALID_URL, url)
273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
275 video_id = mobj.group(2)
278 self.report_video_webpage_download(video_id)
279 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
281 video_webpage = urllib2.urlopen(request).read()
282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
283 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
286 # Attempt to extract SWF player URL
287 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL (\\/ -> /).
289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several `el` variants of get_video_info until one yields a token.
294 self.report_video_info_webpage_download(video_id)
295 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id, el_type))
298 request = urllib2.Request(video_info_url)
300 video_info_webpage = urllib2.urlopen(request).read()
301 video_info = parse_qs(video_info_webpage)
302 if 'token' in video_info:
304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
305 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
307 if 'token' not in video_info:
308 if 'reason' in video_info:
309 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
311 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
314 # Check for "rental" videos
315 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
316 self._downloader.trouble(u'ERROR: "rental" videos not supported')
319 # Start extracting information
320 self.report_information_extraction(video_id)
# uploader
323 if 'author' not in video_info:
324 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
326 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
329 if 'title' not in video_info:
330 self._downloader.trouble(u'ERROR: unable to extract video title')
332 video_title = urllib.unquote_plus(video_info['title'][0])
333 video_title = video_title.decode('utf-8')
# thumbnail (optional: only a warning on failure)
336 if 'thumbnail_url' not in video_info:
337 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
339 else: # don't panic if we can't find it
340 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, normalized to YYYYMMDD; several
# textual date layouts are tried in order.
344 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
346 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
347 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
348 for expression in format_expressions:
350 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description (scraped from the page DOM, cleaned to plain text)
355 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
356 if video_description: video_description = clean_html(video_description)
357 else: video_description = ''
# closed captions: list available subtitle tracks, choose a language
# (requested lang > 'en' > first available), download and convert to SRT.
# Failures raise Trouble, caught below and reported as warnings.
360 video_subtitles = None
361 if self._downloader.params.get('writesubtitles', False):
363 self.report_video_subtitles_download(video_id)
364 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
366 srt_list = urllib2.urlopen(request).read()
367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
369 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
370 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
371 if not srt_lang_list:
372 raise Trouble(u'WARNING: video has no closed captions')
373 if self._downloader.params.get('subtitleslang', False):
374 srt_lang = self._downloader.params.get('subtitleslang')
375 elif 'en' in srt_lang_list:
378 srt_lang = srt_lang_list.keys()[0]
379 if not srt_lang in srt_lang_list:
380 raise Trouble(u'WARNING: no closed captions found in the specified language')
381 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
383 srt_xml = urllib2.urlopen(request).read()
384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
385 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
387 raise Trouble(u'WARNING: unable to download video subtitles')
388 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
389 except Trouble as trouble:
390 self._downloader.trouble(trouble[0])
393 video_token = urllib.unquote_plus(video_info['token'][0])
395 # Decide which formats to download
396 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; otherwise parse the
# url_encoded_fmt_stream_map into an itag -> URL map.
398 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
399 self.report_rtmp_download()
400 video_url_list = [(None, video_info['conn'][0])]
401 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
402 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
403 url_data = [parse_qs(uds) for uds in url_data_strs]
404 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
405 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
407 format_limit = self._downloader.params.get('format_limit', None)
408 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
409 if format_limit is not None and format_limit in available_formats:
410 format_list = available_formats[available_formats.index(format_limit):]
412 format_list = available_formats
413 existing_formats = [x for x in format_list if x in url_map]
414 if len(existing_formats) == 0:
415 self._downloader.trouble(u'ERROR: no known formats available for video')
417 if self._downloader.params.get('listformats', None):
418 self._print_formats(existing_formats)
420 if req_format is None or req_format == 'best':
421 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
422 elif req_format == 'worst':
423 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
424 elif req_format in ('-1', 'all'):
425 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
427 # Specific formats. We pick the first in a slash-delimited sequence.
428 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
429 req_formats = req_format.split('/')
430 video_url_list = None
431 for rf in req_formats:
433 video_url_list = [(rf, url_map[rf])]
435 if video_url_list is None:
436 self._downloader.trouble(u'ERROR: requested format not available')
439 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected format (dict literal's open/close lines
# and the surrounding append/return are elided in this excerpt).
443 for format_param, video_real_url in video_url_list:
445 video_extension = self._video_extensions.get(format_param, 'flv')
448 'id': video_id.decode('utf-8'),
449 'url': video_real_url.decode('utf-8'),
450 'uploader': video_uploader.decode('utf-8'),
451 'upload_date': upload_date,
452 'title': video_title,
453 'ext': video_extension.decode('utf-8'),
454 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
455 'thumbnail': video_thumbnail.decode('utf-8'),
456 'description': video_description,
457 'player_url': player_url,
458 'subtitles': video_subtitles
463 class MetacafeIE(InfoExtractor):
464 """Information Extractor for metacafe.com."""
# _VALID_URL group 1 is the video id (may be a 'yt-…' YouTube passthrough id).
466 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
467 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
468 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
469 IE_NAME = u'metacafe'
471 def __init__(self, downloader=None):
472 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
474 def report_disclaimer(self):
475 """Report disclaimer retrieval."""
476 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
478 def report_age_confirmation(self):
479 """Report attempt to confirm age."""
480 self._downloader.to_screen(u'[metacafe] Confirming age')
482 def report_download_webpage(self, video_id):
483 """Report webpage download."""
484 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
486 def report_extraction(self, video_id):
487 """Report information extraction."""
488 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the over-18 confirmation
# so subsequent watch pages are not filtered. (try: lines elided in excerpt.)
490 def _real_initialize(self):
491 # Retrieve disclaimer
492 request = urllib2.Request(self._DISCLAIMER)
494 self.report_disclaimer()
495 disclaimer = urllib2.urlopen(request).read()
496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
497 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
503 'submit': "Continue - I'm over 18",
505 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
507 self.report_age_confirmation()
508 disclaimer = urllib2.urlopen(request).read()
509 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
510 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
513 def _real_extract(self, url):
514 # Extract id and simplified title from URL
515 mobj = re.match(self._VALID_URL, url)
517 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
520 video_id = mobj.group(1)
522 # Check if video comes from YouTube
523 mobj2 = re.match(r'^yt-(.*)$', video_id)
524 if mobj2 is not None:
# Delegate 'yt-…' ids to the YouTube extractor via the downloader.
525 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
528 # Retrieve video webpage to extract further information
529 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
531 self.report_download_webpage(video_id)
532 webpage = urllib2.urlopen(request).read()
533 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
537 # Extract URL, uploader and title from webpage
538 self.report_extraction(video_id)
# Primary path: direct &mediaURL= parameter in the page.
539 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
541 mediaURL = urllib.unquote(mobj.group(1))
542 video_extension = mediaURL[-3:]
544 # Extract gdaKey if available
545 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
549 gdaKey = mobj.group(1)
550 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars query string for mediaData JSON.
552 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
554 self._downloader.trouble(u'ERROR: unable to extract media URL')
556 vardict = parse_qs(mobj.group(1))
557 if 'mediaData' not in vardict:
558 self._downloader.trouble(u'ERROR: unable to extract media URL')
560 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
562 self._downloader.trouble(u'ERROR: unable to extract media URL')
564 mediaURL = mobj.group(1).replace('\\/', '/')
565 video_extension = mediaURL[-3:]
566 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
568 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
570 self._downloader.trouble(u'ERROR: unable to extract title')
572 video_title = mobj.group(1).decode('utf-8')
574 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
576 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
578 video_uploader = mobj.group(1)
# Resulting info dict (literal's surrounding lines elided in excerpt).
581 'id': video_id.decode('utf-8'),
582 'url': video_url.decode('utf-8'),
583 'uploader': video_uploader.decode('utf-8'),
584 'upload_date': u'NA',
585 'title': video_title,
586 'ext': video_extension.decode('utf-8'),
592 class DailymotionIE(InfoExtractor):
593 """Information Extractor for Dailymotion"""
595 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
596 IE_NAME = u'dailymotion'
598 def __init__(self, downloader=None):
599 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
601 def report_download_webpage(self, video_id):
602 """Report webpage download."""
603 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
605 def report_extraction(self, video_id):
606 """Report information extraction."""
607 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
609 def _real_extract(self, url):
610 # Extract id and simplified title from URL
611 mobj = re.match(self._VALID_URL, url)
613 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
616 video_id = mobj.group(1)
618 video_extension = 'flv'
620 # Retrieve video webpage to extract further information
621 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
622 request.add_header('Cookie', 'family_filter=off')
624 self.report_download_webpage(video_id)
625 webpage = urllib2.urlopen(request).read()
626 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
627 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
630 # Extract URL, uploader and title from webpage
631 self.report_extraction(video_id)
# The player's "sequence" flashvar holds a URL-encoded JSON blob with the
# stream URLs; sdURL is the standard-definition stream.
632 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
634 self._downloader.trouble(u'ERROR: unable to extract media URL')
636 sequence = urllib.unquote(mobj.group(1))
637 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
639 self._downloader.trouble(u'ERROR: unable to extract media URL')
641 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
643 # if needed add http://www.dailymotion.com/ if relative URL
647 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
649 self._downloader.trouble(u'ERROR: unable to extract title')
651 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
653 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
655 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
657 video_uploader = mobj.group(1)
# Resulting info dict (literal's surrounding lines elided in excerpt).
660 'id': video_id.decode('utf-8'),
661 'url': video_url.decode('utf-8'),
662 'uploader': video_uploader.decode('utf-8'),
663 'upload_date': u'NA',
664 'title': video_title,
665 'ext': video_extension.decode('utf-8'),
671 class GoogleIE(InfoExtractor):
672 """Information extractor for video.google.com."""
674 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
675 IE_NAME = u'video.google'
677 def __init__(self, downloader=None):
678 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
680 def report_download_webpage(self, video_id):
681 """Report webpage download."""
682 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
684 def report_extraction(self, video_id):
685 """Report information extraction."""
686 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
688 def _real_extract(self, url):
689 # Extract id from URL
690 mobj = re.match(self._VALID_URL, url)
692 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
695 video_id = mobj.group(1)
697 video_extension = 'mp4'
699 # Retrieve video webpage to extract further information
700 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
702 self.report_download_webpage(video_id)
703 webpage = urllib2.urlopen(request).read()
704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
705 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
708 # Extract URL, uploader, and title from webpage
709 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the hex-escaped flv videoUrl.
710 mobj = re.search(r"download_url:'([^']+)'", webpage)
712 video_extension = 'flv'
713 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
715 self._downloader.trouble(u'ERROR: unable to extract media URL')
717 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JS hex escapes (\x3d is '=', \x26 is '&').
718 mediaURL = mediaURL.replace('\\x3d', '\x3d')
719 mediaURL = mediaURL.replace('\\x26', '\x26')
723 mobj = re.search(r'<title>(.*)</title>', webpage)
725 self._downloader.trouble(u'ERROR: unable to extract title')
727 video_title = mobj.group(1).decode('utf-8')
729 # Extract video description
730 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
732 self._downloader.trouble(u'ERROR: unable to extract video description')
734 video_description = mobj.group(1).decode('utf-8')
735 if not video_description:
736 video_description = 'No description available.'
738 # Extract video thumbnail
739 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail requires a second request against the search page.
740 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
742 webpage = urllib2.urlopen(request).read()
743 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
744 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
746 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
748 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
750 video_thumbnail = mobj.group(1)
751 else: # we need something to pass to process_info
# Resulting info dict (literal's surrounding lines elided in excerpt).
755 'id': video_id.decode('utf-8'),
756 'url': video_url.decode('utf-8'),
758 'upload_date': u'NA',
759 'title': video_title,
760 'ext': video_extension.decode('utf-8'),
766 class PhotobucketIE(InfoExtractor):
767 """Information extractor for photobucket.com."""
# _VALID_URL group 1 is the .flv filename from the ?current= query parameter.
769 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
770 IE_NAME = u'photobucket'
772 def __init__(self, downloader=None):
773 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
775 def report_download_webpage(self, video_id):
776 """Report webpage download."""
777 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
779 def report_extraction(self, video_id):
780 """Report information extraction."""
781 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
783 def _real_extract(self, url):
784 # Extract id from URL
785 mobj = re.match(self._VALID_URL, url)
787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
790 video_id = mobj.group(1)
792 video_extension = 'flv'
794 # Retrieve video webpage to extract further information
795 request = urllib2.Request(url)
797 self.report_download_webpage(video_id)
798 webpage = urllib2.urlopen(request).read()
799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
800 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
803 # Extract URL, uploader, and title from webpage
804 self.report_extraction(video_id)
805 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
807 self._downloader.trouble(u'ERROR: unable to extract media URL')
809 mediaURL = urllib.unquote(mobj.group(1))
# Title and uploader come from the same <title> regex (groups 1 and 2).
813 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
815 self._downloader.trouble(u'ERROR: unable to extract title')
817 video_title = mobj.group(1).decode('utf-8')
819 video_uploader = mobj.group(2).decode('utf-8')
# Resulting info dict (literal's surrounding lines elided in excerpt).
822 'id': video_id.decode('utf-8'),
823 'url': video_url.decode('utf-8'),
824 'uploader': video_uploader,
825 'upload_date': u'NA',
826 'title': video_title,
827 'ext': video_extension.decode('utf-8'),
833 class YahooIE(InfoExtractor):
834 """Information extractor for video.yahoo.com."""
836 # _VALID_URL matches all Yahoo! Video URLs
837 # _VPAGE_URL matches only the extractable '/watch/' URLs
838 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
839 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
840 IE_NAME = u'video.yahoo'
842 def __init__(self, downloader=None):
843 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
845 def report_download_webpage(self, video_id):
846 """Report webpage download."""
847 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
849 def report_extraction(self, video_id):
850 """Report information extraction."""
851 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video guards the one-level recursion used to canonicalize non-/watch/
# URLs before real extraction.
853 def _real_extract(self, url, new_video=True):
854 # Extract ID from URL
855 mobj = re.match(self._VALID_URL, url)
857 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
860 video_id = mobj.group(2)
861 video_extension = 'flv'
863 # Rewrite valid but non-extractable URLs as
864 # extractable English language /watch/ URLs
865 if re.match(self._VPAGE_URL, url) is None:
866 request = urllib2.Request(url)
868 webpage = urllib2.urlopen(request).read()
869 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
870 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
873 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
875 self._downloader.trouble(u'ERROR: Unable to extract id field')
877 yahoo_id = mobj.group(1)
879 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
881 self._downloader.trouble(u'ERROR: Unable to extract vid field')
883 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
885 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
886 return self._real_extract(url, new_video=False)
888 # Retrieve video webpage to extract further information
889 request = urllib2.Request(url)
891 self.report_download_webpage(video_id)
892 webpage = urllib2.urlopen(request).read()
893 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
894 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
897 # Extract uploader and title from webpage
898 self.report_extraction(video_id)
899 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
901 self._downloader.trouble(u'ERROR: unable to extract video title')
903 video_title = mobj.group(1).decode('utf-8')
905 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
907 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' path segment, while the
# uploader name is group(2) — looks like a wrong group index; verify against
# the full file before changing.
909 video_uploader = mobj.group(1).decode('utf-8')
911 # Extract video thumbnail
912 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
914 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
916 video_thumbnail = mobj.group(1).decode('utf-8')
918 # Extract video description
919 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
921 self._downloader.trouble(u'ERROR: unable to extract video description')
923 video_description = mobj.group(1).decode('utf-8')
924 if not video_description:
925 video_description = 'No description available.'
927 # Extract video height and width
928 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
930 self._downloader.trouble(u'ERROR: unable to extract video height')
932 yv_video_height = mobj.group(1)
934 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
936 self._downloader.trouble(u'ERROR: unable to extract video width')
938 yv_video_width = mobj.group(1)
940 # Retrieve video playlist to extract media URL
941 # I'm not completely sure what all these options are, but we
942 # seem to need most of them, otherwise the server sends a 401.
943 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
944 yv_bitrate = '700' # according to Wikipedia this is hard-coded
945 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
946 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
947 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
949 self.report_download_webpage(video_id)
950 webpage = urllib2.urlopen(request).read()
951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
952 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
955 # Extract media URL from playlist XML
956 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
958 self._downloader.trouble(u'ERROR: Unable to extract media URL')
960 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
961 video_url = unescapeHTML(video_url)
# Resulting info dict (literal's surrounding lines elided in excerpt).
964 'id': video_id.decode('utf-8'),
966 'uploader': video_uploader,
967 'upload_date': u'NA',
968 'title': video_title,
969 'ext': video_extension.decode('utf-8'),
970 'thumbnail': video_thumbnail.decode('utf-8'),
971 'description': video_description,
# NOTE(review): 'thumbnail' is assigned twice in this dict literal (see line
# 970 above); the later value wins in Python — likely a leftover duplicate.
972 'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Pulls the clip id out of a Vimeo URL, downloads the clip page,
    parses the player config JSON embedded in the page, and builds a
    play_redirect URL from the request signature/timestamp and the best
    available codec.

    NOTE(review): this excerpt elides several guard / try / return
    lines; error paths call self._downloader.trouble() and bail out.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the media URL and metadata for a Vimeo clip."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard around this error
        # path is elided from this excerpt.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        # NOTE(review): the enclosing "try:" for this network access is
        # elided from this excerpt.
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page (between
        # " = {config:" and ",assets:").
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        # NOTE(review): the error handling around this parse is elided.
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title and uploader from the config JSON.
        video_title = config["video"]["title"]
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the page HTML (not the JSON).
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date; left as u'NA' when the span is absent.
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # NOTE(review): the loop break / for-else scaffolding is elided;
        # this trouble() call is the "no codec matched" path.
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # NOTE(review): the "return [{" opener and the id/url entries of
        # the info dictionary are elided from this excerpt.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific IE claims the URL: first follows
    URL-shortener style redirects (restarting the download chain), then
    scrapes the page for an obvious media URL (JW Player flashvars,
    file=/source= parameters) and a <title>.

    NOTE(review): this excerpt elides several guard / try / return
    lines inside the methods below.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # HEAD request: lets us discover the final URL without
        # downloading the response body.
        class HeadRequest(urllib2.Request):
            def get_method(self):
                # NOTE(review): body elided in this excerpt; presumably
                # returns "HEAD" — TODO confirm.

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Re-issue a HEAD against the new location, dropping
                    # body-related headers.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # NOTE(review): some keyword arguments of this call
                    # (e.g. headers=) are elided from this excerpt.
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    # Any other redirect code is refused outright.
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Retry the same URL as a regular (GET) request.
                # NOTE(review): trailing keyword arguments are elided.
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener; the default opener would follow
        # redirects with GET instead of our HeadRequest.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Not a redirect: tell the caller to continue with this URL.
        if url == new_url: return False

        # Redirect detected: restart the download chain on the target.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # NOTE(review): the "return True" tail appears elided here.

    def _real_extract(self, url):
        """Scrape a generic page for a direct media URL and title."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        # NOTE(review): the enclosing "try:" is elided from this excerpt.
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): the "if mobj is None:" guards around the two
        # searches below are elided from this excerpt.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # NOTE(review): the "return [{" opener of the info dictionary is
        # elided from this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles "ytsearch:<terms>", "ytsearchN:<terms>" and
    "ytsearchall:<terms>" pseudo-URLs by querying the GData API and
    queueing the resulting watch URLs on the downloader.

    NOTE(review): this excerpt elides the guard / return lines and the
    "try: n = int(prefix)" scaffolding of _real_extract.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData search endpoint; %s is the quoted query, %i the 1-based
    # start index; 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000  # hard cap enforced below
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix: download only the single best match.
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # Numeric prefix: validate the requested count (n <= 0 is an error).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation of video_ids / pagenum / limit
        # is elided from this excerpt.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            # NOTE(review): the enclosing "try:" is elided here.
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems bounds how many results actually exist.
            limit = min(n, api_response['totalItems'])

        # Trim to the requested count and queue the watch URLs.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles "gvsearch:<terms>" / "gvsearchN:" / "gvsearchall:"
    pseudo-URLs by scraping video.google.com result pages and queueing
    the videoplay URLs on the downloader.

    NOTE(review): this excerpt elides the guard / return lines, the
    "try: n = int(prefix)" scaffolding, and the loop header of
    _download_n_results.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    # Result page URL; %s are the quoted query and the result offset.
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Matches a videoplay link in the result HTML, capturing the docid.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present while there are further result pages.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix: download only the single best match.
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # Numeric prefix: validate the requested count (n <= 0 is an error).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation of video_ids / pagenum and the
        # enclosing pagination loop header are elided from this excerpt.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No further pages: queue whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles "yvsearch:<terms>" / "yvsearchN:" / "yvsearchall:"
    pseudo-URLs by scraping video.yahoo.com result pages and queueing
    the watch URLs on the downloader.

    NOTE(review): this excerpt elides the guard / return lines, the
    "try: n = int(prefix)" scaffolding, and the loop header of
    _download_n_results.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    # Result page URL; %s are the quoted query and the result offset.
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Matches a watch link in the result HTML, capturing "id1/id2".
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # Empty prefix: download only the single best match.
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # Numeric prefix: validate the requested count (n <= 0 is an error).
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen deduplicates ids across result pages.
        already_seen = set()
        # NOTE(review): initialisation of video_ids / pagenum and the
        # pagination loop header are elided from this excerpt.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No further pages: queue whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated playlist HTML, collects the video ids, applies
    the playliststart/playlistend window, and queues each watch URL.

    NOTE(review): this excerpt elides the guard / return lines, the
    pagination loop header, and the if/else scaffolding in
    _real_extract.
    """

    # group(1): playlist type char (p/a/list); group(2): playlist id;
    # group(3): a trailing video id for the user/.../<vid> form.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
    # Present while a "next" pager link exists.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a playlist on the downloader."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: the URL carried an explicit video id.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # NOTE(review): the "else:" branch scaffolding is elided here.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        # NOTE(review): video_ids initialisation and the pagination
        # loop header are elided from this excerpt.
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging once no "next" link is present.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        pagenum = pagenum + 1

        # Apply the --playlist-start/--playlist-end window (1-based
        # start; -1 end means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # NOTE(review): this slice is the elided "else:" branch.
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads feed via the GData API, collects all
    video ids, applies the playliststart/playlistend window, and queues
    each watch URL.

    NOTE(review): this excerpt elides the guard / return lines, the
    pagination loop header, and the if/else scaffolding in
    _real_extract.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # GData caps results per request
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue every upload of a YouTube user on the downloader."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): the rest of this comment and the loop header
        # (video_ids/pagenum init) are elided from this excerpt.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        # Apply the --playlist-start/--playlist-end window.
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # NOTE(review): this slice is the elided "else:" branch.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Rebuilds the URL in the English locale, POSTs the "Free download"
    form, and scrapes the real fileshare URL and file title from the
    resulting page.

    NOTE(review): this excerpt elides the enclosing "try:" / "return" /
    "else:" lines in _real_extract.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title for a hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        # NOTE(review): the enclosing "try:" is elided here.
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner
                # and surface it as the error message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                # NOTE(review): this is the elided "else:" fallback path.
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the "return [{" opener of the info dictionary
        # is elided from this excerpt.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in with the user's credentials (command line or
    .netrc), downloads the video page, scrapes metadata and the
    per-format source URLs out of the page's JavaScript, and builds one
    info dictionary per selected format.

    NOTE(review): this excerpt elides many guard / try / return /
    else lines and some dictionary literals; error paths call
    self._downloader.trouble() / to_stderr() and bail out.
    """

    # group('ID') is the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used both for scraping and format selection.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Maps format name -> filename extension.
    # NOTE(review): the dictionary entries are elided from this excerpt.
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Receive an optional FileDownloader, like the other IEs."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata: one regex per field.
        # NOTE(review): the closing of this literal and the
        # "video_info = {}" initialisation are elided from this excerpt.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-format source URLs.
        # NOTE(review): the "video_urls = {}" initialisation is elided.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls
        # NOTE(review): the "return video_info" tail appears elided.

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Failures are reported as warnings; extraction proceeds
        unauthenticated.
        """
        if self._downloader is None:
            # NOTE(review): the early "return" appears elided here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the enclosing "try:" is elided here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): credential unpacking and the "else:"
                # raising branch are partially elided here.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials at all: skip logging in.
        if useremail is None:

        # Log in; the login_form construction is elided from this excerpt.
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # NOTE(review): the enclosing "try:" is elided here.
        login_results = urllib2.urlopen(request).read()
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            # Login page served again -> credentials were rejected.
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and per-format URLs for a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # NOTE(review): the enclosing "try:" is elided here.
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image: missing thumbnail is only a warning.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            # NOTE(review): this assignment is the elided "else:" branch.
            video_thumbnail = video_info['thumbnail']

        # upload date: parse the RFC-2822 style date if present.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # NOTE(review): the enclosing "try:" is elided here.
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                # NOTE(review): this assignment is the elided "else:" branch.
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # NOTE(review): the final "else:" opener is elided here.
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # Build one info dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the dictionary opener (e.g. results.append({)
            # is elided from this excerpt.
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Requests the JSON skin of a blip.tv page. If the server answers
    with the media itself (Content-Type video/*) the URL is treated as
    a direct download; otherwise the returned JSON is parsed for the
    media URL and metadata.

    NOTE(review): this excerpt elides several guard / try / else lines
    and two info-dictionary openers in _real_extract.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract media URL and metadata for a blip.tv post."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # NOTE(review): determination of the query separator "cchar"
        # ('?' or '&') is elided from this excerpt.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        # NOTE(review): the "info = None" initialisation and enclosing
        # "try:" are elided here.
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # Derive title/extension from the last URL component.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the direct-download info dictionary is
            # elided from this excerpt.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        if info is None: # Regular URL
            # NOTE(review): the enclosing "try:" is elided here.
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            # NOTE(review): the "try:" guarding this parse is elided.
            json_data = json.loads(json_code)
            # The payload may be wrapped in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']

            # datestamp is like "08-25-11 10:34AM" -> YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): the "if umobj is None:" guard is elided here.
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): the "info = {" opener is elided here.
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1984 class MyVideoIE(InfoExtractor):
1985 """Information Extractor for myvideo.de."""
1987 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1988 IE_NAME = u'myvideo'
1990 def __init__(self, downloader=None):
1991 InfoExtractor.__init__(self, downloader)
1993 def report_download_webpage(self, video_id):
1994 """Report webpage download."""
1995 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1997 def report_extraction(self, video_id):
1998 """Report information extraction."""
1999 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2001 def _real_extract(self,url):
2002 mobj = re.match(self._VALID_URL, url)
2004 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2007 video_id = mobj.group(1)
2010 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2012 self.report_download_webpage(video_id)
2013 webpage = urllib2.urlopen(request).read()
2014 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2015 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2018 self.report_extraction(video_id)
2019 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2022 self._downloader.trouble(u'ERROR: unable to extract media URL')
2024 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2026 mobj = re.search('<title>([^<]+)</title>', webpage)
2028 self._downloader.trouble(u'ERROR: unable to extract title')
2031 video_title = mobj.group(1)
2037 'upload_date': u'NA',
2038 'title': video_title,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2044 class ComedyCentralIE(InfoExtractor):
2045 """Information extractor for The Daily Show and Colbert Report """
# Matches either a bare shortname prefixed with ':' (e.g. ':tds') or a
# full-episodes URL on thedailyshow.com / colbertnation.com
2047 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2048 IE_NAME = u'comedycentral'
# Progress reporters — one per network round-trip below
2050 def report_extraction(self, episode_id):
2051 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2053 def report_config_download(self, episode_id):
2054 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2056 def report_index_download(self, episode_id):
2057 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2059 def report_player_url(self, episode_id):
2060 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2062 def _real_extract(self, url):
2063 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2065 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames (':tds', ':colbert', ...) expand to the show's newest-episodes URL
2068 if mobj.group('shortname'):
2069 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2070 url = u'http://www.thedailyshow.com/full-episodes/'
# (else branch header elided) all other shortnames map to Colbert
2072 url = u'http://www.colbertnation.com/full-episodes/'
# Re-match so the named groups reflect the expanded URL
2073 mobj = re.match(self._VALID_URL, url)
2074 assert mobj is not None
# No specific episode in the URL means "download the newest episode"
2076 dlNewest = not mobj.group('episode')
# (if/else lines elided) epTitle is the show name when dlNewest, else the episode slug
2078 epTitle = mobj.group('showname')
2080 epTitle = mobj.group('episode')
2082 req = urllib2.Request(url)
2083 self.report_extraction(epTitle)
# (try: elided) fetch the episode page; the server redirects newest-episode
# URLs to a concrete episode, hence the geturl() below
2085 htmlHandle = urllib2.urlopen(req)
2086 html = htmlHandle.read()
2087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2088 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# (dlNewest branch header elided) follow the redirect to the concrete episode
2091 url = htmlHandle.geturl()
2092 mobj = re.match(self._VALID_URL, url)
2094 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2096 if mobj.group('episode') == '':
2097 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2099 epTitle = mobj.group('episode')
# The Flash player embed carries the mtvnservices media URL and its URI param
2101 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2102 if len(mMovieParams) == 0:
2103 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2106 playerUrl_raw = mMovieParams[0][0]
2107 self.report_player_url(epTitle)
# (try: elided) resolve redirects to get the canonical player URL
2109 urlHandle = urllib2.urlopen(playerUrl_raw)
2110 playerUrl = urlHandle.geturl()
2111 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2112 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The second capture group is the mtvn URI used to query the MRSS show index
2115 uri = mMovieParams[0][1]
2116 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2117 self.report_index_download(epTitle)
# (try: elided)
2119 indexXml = urllib2.urlopen(indexUrl).read()
2120 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2121 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One episode is split into several acts; each <item> is one media segment
2126 idoc = xml.etree.ElementTree.fromstring(indexXml)
2127 itemEls = idoc.findall('.//item')
2128 for itemEl in itemEls:
2129 mediaId = itemEl.findall('./guid')[0].text
2130 shortMediaId = mediaId.split(':')[-1]
2131 showId = mediaId.split(':')[-2].replace('.com', '')
2132 officialTitle = itemEl.findall('./title')[0].text
2133 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate -> src URL)
2135 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2136 urllib.urlencode({'uri': mediaId}))
2137 configReq = urllib2.Request(configUrl)
2138 self.report_config_download(epTitle)
# (try: elided)
2140 configXml = urllib2.urlopen(configReq).read()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2145 cdoc = xml.etree.ElementTree.fromstring(configXml)
# (turls init elided) collect (bitrate, url) pairs from the renditions
2147 for rendition in cdoc.findall('.//rendition'):
2148 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
# (empty-turls guard elided)
2152 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2155 # For now, just pick the highest bitrate
2156 format,video_url = turls[-1]
2158 effTitle = showId + u'-' + epTitle
# (dict header and several entries elided) per-segment info dict
2163 'upload_date': officialDate,
2168 'description': officialTitle,
2169 'player_url': playerUrl
2172 results.append(info)
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2177 class EscapistIE(InfoExtractor):
2178 """Information extractor for The Escapist """
2180 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2181 IE_NAME = u'escapist'
2183 def report_extraction(self, showName):
2184 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2186 def report_config_download(self, showName):
2187 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2189 def _real_extract(self, url):
2190 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2192 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2194 showName = mobj.group('showname')
2195 videoId = mobj.group('episode')
2197 self.report_extraction(showName)
# (try: elided) fetch the page bytes and decode using the charset the server
# declares in Content-Type, falling back to utf-8
2199 webPage = urllib2.urlopen(url)
2200 webPageBytes = webPage.read()
2201 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2202 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2204 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata comes from <meta> tags; no None-checks here, so a missing tag
# raises AttributeError on .group(1) — presumably caught upstream
2207 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2208 description = unescapeHTML(descMatch.group(1))
2209 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2210 imgUrl = unescapeHTML(imgMatch.group(1))
2211 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2212 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a percent-encoded config URL in its query string
2213 configUrlMatch = re.search('config=(.*)$', playerUrl)
2214 configUrl = urllib2.unquote(configUrlMatch.group(1))
2216 self.report_config_download(showName)
# (try: elided)
2218 configJSON = urllib2.urlopen(configUrl).read()
2219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2220 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2223 # Technically, it's JavaScript, not JSON
# Single->double quote swap makes the JS object literal parseable as JSON
2224 configJSON = configJSON.replace("'", '"')
# (try: elided)
2227 config = json.loads(configJSON)
2228 except (ValueError,), err:
2229 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] (not [0]) holds the actual video entry — presumably [0] is an ad
# or preroll; TODO confirm
2232 playlist = config['playlist']
2233 videoUrl = playlist[1]['url']
# (dict header and several entries elided) result info dict
2238 'uploader': showName,
2239 'upload_date': None,
2243 'thumbnail': imgUrl,
2244 'description': description,
2245 'player_url': playerUrl,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2251 class CollegeHumorIE(InfoExtractor):
2252 """Information extractor for collegehumor.com"""
2254 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2255 IE_NAME = u'collegehumor'
2257 def report_webpage(self, video_id):
2258 """Report information extraction."""
2259 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2261 def report_extraction(self, video_id):
2262 """Report information extraction."""
2263 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2265 def _real_extract(self, url):
2266 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2268 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2270 video_id = mobj.group('videoid')
2272 self.report_webpage(video_id)
2273 request = urllib2.Request(url)
# (try: elided)
2275 webpage = urllib2.urlopen(request).read()
2276 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2277 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal numeric ID distinct from the URL's videoid;
# the moogaloop metadata endpoint below is keyed on it
2280 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
# (guard elided)
2282 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2284 internal_video_id = m.group('internalvideoid')
# (info dict header and 'id' entry elided)
2288 'internal_id': internal_video_id,
2291 self.report_extraction(video_id)
2292 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
# (try: elided)
2294 metaXml = urllib2.urlopen(xmlUrl).read()
2295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2296 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# (try: elided) any missing element raises IndexError on [0], reported below
2299 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2301 videoNode = mdoc.findall('./video')[0]
2302 info['description'] = videoNode.findall('./description')[0].text
2303 info['title'] = videoNode.findall('./caption')[0].text
2304 info['url'] = videoNode.findall('./file')[0].text
2305 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# extension taken from the media URL's suffix; format mirrors it
2306 info['ext'] = info['url'].rpartition('.')[2]
2307 info['format'] = info['ext']
# (except header elided)
2309 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2315 class XVideosIE(InfoExtractor):
2316 """Information extractor for xvideos.com"""
2318 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2319 IE_NAME = u'xvideos'
2321 def report_webpage(self, video_id):
2322 """Report information extraction."""
2323 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2325 def report_extraction(self, video_id):
2326 """Report information extraction."""
2327 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2329 def _real_extract(self, url):
2330 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2332 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2334 video_id = mobj.group(1).decode('utf-8')
2336 self.report_webpage(video_id)
2338 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
# (try: elided)
2340 webpage = urllib2.urlopen(request).read()
2341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2342 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2345 self.report_extraction(video_id)
# The percent-encoded flv URL is embedded in the page's query parameters
2349 mobj = re.search(r'flv_url=(.+?)&', webpage)
# (guard elided)
2351 self._downloader.trouble(u'ERROR: unable to extract video url')
2353 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVIDEOS..." suffix
2357 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
# (guard elided)
2359 self._downloader.trouble(u'ERROR: unable to extract video title')
2361 video_title = mobj.group(1).decode('utf-8')
2364 # Extract video thumbnail
2365 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
# (guard elided)
2367 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched thumbnail URL, not just the filename capture
2369 video_thumbnail = mobj.group(0).decode('utf-8')
# (dict header and several entries elided) result info dict
2375 'upload_date': None,
2376 'title': video_title,
2379 'thumbnail': video_thumbnail,
2380 'description': None,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2387 class SoundcloudIE(InfoExtractor):
2388 """Information extractor for soundcloud.com
2389 To access the media, the uid of the song and a stream token
2390 must be extracted from the page source and the script must make
2391 a request to media.soundcloud.com/crossdomain.xml. Then
2392 the media can be grabbed by requesting from an url composed
2393 of the stream token and uid
# (docstring closer elided)
2396 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2397 IE_NAME = u'soundcloud'
2399 def __init__(self, downloader=None):
2400 InfoExtractor.__init__(self, downloader)
2402 def report_webpage(self, video_id):
2403 """Report information extraction."""
2404 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2406 def report_extraction(self, video_id):
2407 """Report information extraction."""
2408 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2410 def _real_extract(self, url):
2411 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2413 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2416 # extract uploader (which is in the url)
2417 uploader = mobj.group(1).decode('utf-8')
2418 # extract simple title (uploader + slug of song title)
2419 slug_title = mobj.group(2).decode('utf-8')
2420 simple_title = uploader + u'-' + slug_title
2422 self.report_webpage('%s/%s' % (uploader, slug_title))
2424 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
# (try: elided)
2426 webpage = urllib2.urlopen(request).read()
2427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2428 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2431 self.report_extraction('%s/%s' % (uploader, slug_title))
2433 # extract uid and stream token that soundcloud hands out for access
2434 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
# (guard elided) the groups below are only read when the search matched
2436 video_id = mobj.group(1)
2437 stream_token = mobj.group(2)
2439 # extract unsimplified title
2440 mobj = re.search('"title":"(.*?)",', webpage)
# (if/else lines elided) fall back to the slug-derived title when no match
2442 title = mobj.group(1).decode('utf-8')
2444 title = simple_title
2446 # construct media url (with uid/token)
2447 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2448 mediaURL = mediaURL % (video_id, stream_token)
# (comment/section lines elided) description is best-effort
2451 description = u'No description available'
2452 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
# (guard elided)
2454 description = mobj.group(1)
# (upload_date init and try: elided) e.g. "November 3, 2011 20:00" -> YYYYMMDD
2458 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2461 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
# broad catch is deliberate best-effort: date parse failure only logs to stderr
2462 except Exception, e:
2463 self._downloader.to_stderr(str(e))
2465 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): only the Request construction is visible here — the urlopen()
# call that would actually perform it is not in view; confirm it isn't dead code
2466 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# (return/dict header and several entries elided) result info dict
2469 'id': video_id.decode('utf-8'),
2471 'uploader': uploader.decode('utf-8'),
2472 'upload_date': upload_date,
2477 'description': description.decode('utf-8')
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines (including the IE_NAME assignment).
# Code kept byte-identical; comments only.
2481 class InfoQIE(InfoExtractor):
2482 """Information extractor for infoq.com"""
2484 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2487 def report_webpage(self, video_id):
2488 """Report information extraction."""
2489 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2491 def report_extraction(self, video_id):
2492 """Report information extraction."""
2493 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2495 def _real_extract(self, url):
2496 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# the full URL doubles as the progress-report id (no numeric video id in URL)
2501 self.report_webpage(url)
2503 request = urllib2.Request(url)
# (try: elided)
2505 webpage = urllib2.urlopen(request).read()
2506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2507 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2510 self.report_extraction(url)
# jsclassref holds the base64-encoded RTMP path of the media
2514 mobj = re.search(r"jsclassref='([^']*)'", webpage)
# (guard elided)
2516 self._downloader.trouble(u'ERROR: unable to extract video url')
2518 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2522 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
# (guard elided)
2524 self._downloader.trouble(u'ERROR: unable to extract video title')
2526 video_title = mobj.group(1).decode('utf-8')
2528 # Extract description
2529 video_description = u'No description available.'
2530 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2531 if mobj is not None:
2532 video_description = mobj.group(1).decode('utf-8')
# id and extension come from the last path segment of the RTMP URL
2534 video_filename = video_url.split('/')[-1]
2535 video_id, extension = video_filename.split('.')
# (dict header and several entries elided) result info dict
2541 'upload_date': None,
2542 'title': video_title,
2544 'format': extension, # Extension is always(?) mp4, but seems to be flv
2546 'description': video_description,
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2552 class MixcloudIE(InfoExtractor):
2553 """Information extractor for www.mixcloud.com"""
2554 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2555 IE_NAME = u'mixcloud'
2557 def __init__(self, downloader=None):
2558 InfoExtractor.__init__(self, downloader)
2560 def report_download_json(self, file_id):
2561 """Report JSON download."""
2562 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2564 def report_extraction(self, file_id):
2565 """Report information extraction."""
2566 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2568 def get_urls(self, jsonData, fmt, bitrate='best'):
2569 """Get urls from 'audio_formats' section in json"""
# (try: elided) jsonData[fmt] is either {bitrate: [urls]} or a flat [urls]
2572 bitrate_list = jsonData[fmt]
2573 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2574 bitrate = max(bitrate_list) # select highest
2576 url_list = jsonData[fmt][bitrate]
# indexing a flat list with a string bitrate raises TypeError -> no bitrates
2577 except TypeError: # we have no bitrate info.
2578 url_list = jsonData[fmt]
# (return elided)
2581 def check_urls(self, url_list):
2582 """Returns 1st active url from list"""
2583 for url in url_list:
# (try: elided) probe each candidate; first one that opens wins
2585 urllib2.urlopen(url)
# (return url elided above; dead-url fallthrough elided below)
2587 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# (continue / return None lines elided)
2592 def _print_formats(self, formats):
2593 print 'Available formats:'
2594 for fmt in formats.keys():
2595 for b in formats[fmt]:
# (try: elided) same dict-vs-list shape handling as get_urls
2597 ext = formats[fmt][b][0]
2598 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2599 except TypeError: # we have no bitrate info
2600 ext = formats[fmt][0]
2601 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
# (break elided — a flat list need only be printed once)
2604 def _real_extract(self, url):
2605 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2607 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2609 # extract uploader & filename from url
2610 uploader = mobj.group(1).decode('utf-8')
2611 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2613 # construct API request
2614 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2615 # retrieve .json file with links to files
2616 request = urllib2.Request(file_url)
# (try: elided)
2618 self.report_download_json(file_url)
2619 jsonData = urllib2.urlopen(request).read()
2620 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2621 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
# (parse section header elided)
2625 json_data = json.loads(jsonData)
2626 player_url = json_data['player_swf_url']
2627 formats = dict(json_data['audio_formats'])
2629 req_format = self._downloader.params.get('format', None)
# (file_url reset elided)
2632 if self._downloader.params.get('listformats', None):
2633 self._print_formats(formats)
# (return elided) listformats only prints, it does not download
2636 if req_format is None or req_format == 'best':
# try every format, keep the first with a live URL
2637 for format_param in formats.keys():
2638 url_list = self.get_urls(formats, format_param)
# (comment elided)
2640 file_url = self.check_urls(url_list)
2641 if file_url is not None:
# (break / else branch header elided)
2644 if req_format not in formats.keys():
2645 self._downloader.trouble(u'ERROR: format is not available')
# (return elided)
2648 url_list = self.get_urls(formats, req_format)
2649 file_url = self.check_urls(url_list)
2650 format_param = req_format
# (return [{...}] header elided)
2653 'id': file_id.decode('utf-8'),
2654 'url': file_url.decode('utf-8'),
2655 'uploader': uploader.decode('utf-8'),
2656 'upload_date': u'NA',
2657 'title': json_data['name'],
2658 'ext': file_url.split('.')[-1].decode('utf-8'),
2659 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2660 'thumbnail': json_data['thumbnail_url'],
2661 'description': json_data['description'],
2662 'player_url': player_url.decode('utf-8'),
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided guard/try/return lines.  Code kept byte-identical; comments only.
2665 class StanfordOpenClassroomIE(InfoExtractor):
2666 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), or the site root — each handled by a branch of _real_extract
2668 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2669 IE_NAME = u'stanfordoc'
2671 def report_download_webpage(self, objid):
2672 """Report information extraction."""
2673 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2675 def report_extraction(self, video_id):
2676 """Report information extraction."""
2677 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2679 def _real_extract(self, url):
2680 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2682 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2685 if mobj.group('course') and mobj.group('video'): # A specific video
2686 course = mobj.group('course')
2687 video = mobj.group('video')
# (dict header elided) info dict keyed "course_video"
2689 'id': course + '_' + video,
2692 self.report_extraction(info['id'])
2693 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2694 xmlUrl = baseUrl + video + '.xml'
# (try: elided)
2696 metaXml = urllib2.urlopen(xmlUrl).read()
2697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2698 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2700 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (try: elided) missing elements raise IndexError on [0], reported below
2702 info['title'] = mdoc.findall('./title')[0].text
2703 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
# (except header elided)
2705 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2707 info['ext'] = info['url'].rpartition('.')[2]
2708 info['format'] = info['ext']
# (return elided)
2710 elif mobj.group('course'): # A course page
2711 course = mobj.group('course')
# (info dict init with id/type elided)
2717 self.report_download_webpage(info['id'])
# (try: elided)
2719 coursepage = urllib2.urlopen(url).read()
2720 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2721 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2724 m = re.search('<h1>([^<]+)</h1>', coursepage)
# (if/else lines elided) fall back to the id when no <h1> title
2726 info['title'] = unescapeHTML(m.group(1))
2728 info['title'] = info['id']
2730 m = re.search('<description>([^<]+)</description>', coursepage)
# (guard elided)
2732 info['description'] = unescapeHTML(m.group(1))
# collect every VideoPage link once, preserving order
2734 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (list-comprehension / loop header elided) each link becomes a reference entry
2737 'type': 'reference',
2738 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# (results init elided) recurse into each referenced video page
2742 for entry in info['list']:
2743 assert entry['type'] == 'reference'
2744 results += self.extract(entry['url'])
# (return results elided; else-branch "whole site" header elided)
2749 'id': 'Stanford OpenClassroom',
2753 self.report_download_webpage(info['id'])
2754 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
# (try: elided)
2756 rootpage = urllib2.urlopen(rootURL).read()
2757 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2758 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2761 info['title'] = info['id']
2763 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
# (loop/comprehension header elided) each course link becomes a reference entry
2766 'type': 'reference',
2767 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
# recurse into every course page found on the root page
2772 for entry in info['list']:
2773 assert entry['type'] == 'reference'
2774 results += self.extract(entry['url'])
# NOTE(review): mangled extraction — fused line numbers, lost indentation,
# elided lines; additionally this class is TRUNCATED at the end of the visible
# chunk (the result dict is cut off).  Code kept byte-identical; comments only.
2777 class MTVIE(InfoExtractor):
2778 """Information extractor for MTV.com"""
2780 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# (IE_NAME assignment elided)
2783 def report_webpage(self, video_id):
2784 """Report information extraction."""
2785 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2787 def report_extraction(self, video_id):
2788 """Report information extraction."""
2789 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2791 def _real_extract(self, url):
2792 mobj = re.match(self._VALID_URL, url)
# (guard elided) invalid URL -> trouble(); a return presumably follows
2794 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# protocol is optional in _VALID_URL, so normalize before fetching
2796 if not mobj.group('proto'):
2797 url = 'http://' + url
2798 video_id = mobj.group('videoid')
2799 self.report_webpage(video_id)
2801 request = urllib2.Request(url)
# (try: elided)
2803 webpage = urllib2.urlopen(request).read()
2804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2805 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song/performer metadata lives in mtv_* meta tags, iso-8859-1 encoded
2808 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
# (guard elided)
2810 self._downloader.trouble(u'ERROR: unable to extract song name')
2812 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2813 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
# (guard elided)
2815 self._downloader.trouble(u'ERROR: unable to extract performer')
2817 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2818 video_title = performer + ' - ' + song_name
2820 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# (guard elided) note the error message is missing a verb ("unable to mtvn_uri")
2822 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2824 mtvn_uri = mobj.group(1)
2826 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
# (guard elided)
2828 self._downloader.trouble(u'ERROR: unable to extract content id')
2830 content_id = mobj.group(1)
# mediaGen endpoint returns rendition XML for the uri/id/vid triple
2832 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2833 self.report_extraction(video_id)
2834 request = urllib2.Request(videogen_url)
# (try: elided)
2836 metadataXml = urllib2.urlopen(request).read()
2837 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2838 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2841 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2842 renditions = mdoc.findall('.//rendition')
2844 # For now, always pick the highest quality.
2845 rendition = renditions[-1]
# (try: elided) type attr is "video/<ext>"; format encodes ext+resolution+bitrate
2848 _,_,ext = rendition.attrib['type'].partition('/')
2849 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2850 video_url = rendition.find('./src').text
# (except header elided)
2852 self._downloader.trouble('Invalid rendition field.')
# (result dict header elided; dict continues beyond the visible chunk)
2858 'uploader': performer,
2859 'title': video_title,