Introduced Trouble(Exception) for more elegant handling of non-fatal errors
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
17
18 try:
19         import cStringIO as StringIO
20 except ImportError:
21         import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces, for each video the
	URL refers to, a dictionary of extracted metadata that the
	FileDownloader then acts upon (typically by downloading the video).
	Every result dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	These fields are optional and mainly serve forced-printing front ends
	(e.g. a video search backend such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize() and _real_extract()
	and define a _VALID_URL regular expression; they should usually also
	be registered in the list of extractors.
	"""

	# Class-level defaults; __init__ re-assigns both on the instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the downloader this extractor reports through."""
		self._downloader = downloader

	def suitable(self, url):
		"""Return True when this extractor can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Perform one-time setup (authentication, etc.), at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then return the extracted info dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Actual initialization; subclasses override as needed."""
		pass

	def _real_extract(self, url):
		"""Actual extraction; subclasses override as needed."""
		pass
94
95
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality (itag codes, best first)
	_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same list but ranking free (WebM) formats above non-free ones of similar quality
	_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# itag -> filename extension; anything missing falls back to 'flv' at use sites
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
		'46': 'webm',
	}
	# itag -> "HEIGHTxWIDTH" string, used only for --list-formats display
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
		'46': '1080x1920',
	}
	IE_NAME = u'youtube'

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that a requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube's timedtext XML into SubRip (.srt) text."""
		srt = ''
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			# Missing dur attribute: assume a 4-second display time.
			if not dur: dur = '4'
			start = float(start)
			end = start + float(dur)
			# Format as HH:MM:SS,mmm timestamps for the SRT cue line.
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional (source is double-escaped)
			# NOTE(review): SRT cue numbering conventionally starts at 1;
			# enumerate starts this at 0 — confirm players accept it.
			srt += str(n) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		return srt

	def _print_formats(self, formats):
		"""Print itag/extension/dimensions for each available format."""
		print 'Available formats:'
		for x in formats:
			print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

	def _real_initialize(self):
		"""Set language, optionally log in, and confirm age (best effort).

		Credentials come from downloader params or, with --netrc, from the
		'youtube' machine entry in ~/.netrc. Failures before the age step
		are warnings only; the age-confirmation failure is reported via
		trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so later regexes match)
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract metadata and direct URLs for a YouTube video.

		Returns a list of info dictionaries (one per selected format), or
		None after reporting trouble. Relies on get_video_info rather than
		scraping the watch page for most fields.
		"""
		# Extract original video URL from URL with redirection, like age verification, using next_url parameter
		mobj = re.search(self._NEXT_URL_RE, url)
		if mobj:
			url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

		# Extract video id from URL (group 2 of _VALID_URL)
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage (used for SWF player URL, upload date, description)
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (needed for signature handling downstream)
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Unescape the JSON-style backslash escaping.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' page types until one yields a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		# video_info holds the last attempt's result if no 'token' was found.
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		simple_title = simplify_title(video_title)

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page, normalized to YYYYMMDD
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# NOTE(review): bare except also hides non-parse errors; once one
					# expression matches, later attempts fail silently and are ignored.
					pass

		# description
		video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
		if video_description: video_description = clean_html(video_description)
		else: video_description = ''

		# closed captions (only when --write-srt was requested);
		# Trouble (from utils) carries a warning message as its first arg.
		video_subtitles = None
		if self._downloader.params.get('writesubtitles', False):
			try:
				self.report_video_subtitles_download(video_id)
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
				try:
					srt_list = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
				if not srt_lang_list:
					raise Trouble(u'WARNING: video has no closed captions')
				# Language preference: explicit --srt-lang, then English, then first listed.
				if self._downloader.params.get('subtitleslang', False):
					srt_lang = self._downloader.params.get('subtitleslang')
				elif 'en' in srt_lang_list:
					srt_lang = 'en'
				else:
					srt_lang = srt_lang_list[0]
				if not srt_lang in srt_lang_list:
					raise Trouble(u'WARNING: no closed captions found in the specified language')
				request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
				try:
					srt_xml = urllib2.urlopen(request).read()
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
				video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
			except Trouble as trouble:
				# trouble[0] is the message (Python 2 exception indexing).
				self._downloader.trouble(trouble[0])

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP stream: single URL, no itag.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			# Build itag -> URL map from the comma-separated, urlencoded stream map.
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			format_limit = self._downloader.params.get('format_limit', None)
			available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
			if format_limit is not None and format_limit in available_formats:
				# Cap quality at format_limit: keep it and everything worse.
				format_list = available_formats[available_formats.index(format_limit):]
			else:
				format_list = available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if self._downloader.params.get('listformats', None):
				self._print_formats(existing_formats)
				return
			if req_format is None or req_format == 'best':
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format in ('-1', 'all'):
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific formats. We pick the first in a slash-delimeted sequence.
				# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
				req_formats = req_format.split('/')
				video_url_list = None
				for rf in req_formats:
					if rf in url_map:
						video_url_list = [(rf, url_map[rf])]
						break
				if video_url_list is None:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		# One result dict per selected format.
		results = []
		for format_param, video_real_url in video_url_list:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			results.append({
				'id':		video_id.decode('utf-8'),
				'url':		video_real_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	upload_date,
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail':	video_thumbnail.decode('utf-8'),
				'description':	video_description,
				'player_url':	player_url,
				'subtitles':	video_subtitles
			})
		return results
459
460
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and post the age/family-filter form.

		Both steps are needed so later watch-page fetches are not blocked
		by the family filter; failures are reported via trouble().
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age by disabling the family filter
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the direct media URL and metadata for a Metacafe video.

		YouTube-hosted entries ('yt-' ids) are delegated back to the
		downloader; returns a single-element list of info dicts, or None
		after reporting trouble.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, hand it off and stop.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
			return

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Older page layout: media URL (plus optional gdaKey) in the page itself.
			mediaURL = urllib.unquote(mobj.group(1))
			# NOTE(review): extension taken as the URL's last 3 chars — breaks
			# for extensions of other lengths or trailing query strings.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer layout: media URL embedded in the flashvars parameter.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Unescape JSON-style forward slashes.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
592
593
594 class DailymotionIE(InfoExtractor):
595         """Information Extractor for Dailymotion"""
596
597         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
598         IE_NAME = u'dailymotion'
599
600         def __init__(self, downloader=None):
601                 InfoExtractor.__init__(self, downloader)
602
603         def report_download_webpage(self, video_id):
604                 """Report webpage download."""
605                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
606
607         def report_extraction(self, video_id):
608                 """Report information extraction."""
609                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
610
611         def _real_extract(self, url):
612                 # Extract id and simplified title from URL
613                 mobj = re.match(self._VALID_URL, url)
614                 if mobj is None:
615                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
616                         return
617
618                 video_id = mobj.group(1)
619
620                 video_extension = 'flv'
621
622                 # Retrieve video webpage to extract further information
623                 request = urllib2.Request(url)
624                 request.add_header('Cookie', 'family_filter=off')
625                 try:
626                         self.report_download_webpage(video_id)
627                         webpage = urllib2.urlopen(request).read()
628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
629                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
630                         return
631
632                 # Extract URL, uploader and title from webpage
633                 self.report_extraction(video_id)
634                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
635                 if mobj is None:
636                         self._downloader.trouble(u'ERROR: unable to extract media URL')
637                         return
638                 sequence = urllib.unquote(mobj.group(1))
639                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
640                 if mobj is None:
641                         self._downloader.trouble(u'ERROR: unable to extract media URL')
642                         return
643                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
644
645                 # if needed add http://www.dailymotion.com/ if relative URL
646
647                 video_url = mediaURL
648
649                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
650                 if mobj is None:
651                         self._downloader.trouble(u'ERROR: unable to extract title')
652                         return
653                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
654                 video_title = sanitize_title(video_title)
655                 simple_title = simplify_title(video_title)
656
657                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
658                 if mobj is None:
659                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
660                         return
661                 video_uploader = mobj.group(1)
662
663                 return [{
664                         'id':           video_id.decode('utf-8'),
665                         'url':          video_url.decode('utf-8'),
666                         'uploader':     video_uploader.decode('utf-8'),
667                         'upload_date':  u'NA',
668                         'title':        video_title,
669                         'stitle':       simple_title,
670                         'ext':          video_extension.decode('utf-8'),
671                         'format':       u'NA',
672                         'player_url':   None,
673                 }]
674
675
676 class GoogleIE(InfoExtractor):
677         """Information extractor for video.google.com."""
678
679         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
680         IE_NAME = u'video.google'
681
682         def __init__(self, downloader=None):
683                 InfoExtractor.__init__(self, downloader)
684
685         def report_download_webpage(self, video_id):
686                 """Report webpage download."""
687                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
688
689         def report_extraction(self, video_id):
690                 """Report information extraction."""
691                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
692
693         def _real_extract(self, url):
694                 # Extract id from URL
695                 mobj = re.match(self._VALID_URL, url)
696                 if mobj is None:
697                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
698                         return
699
700                 video_id = mobj.group(1)
701
702                 video_extension = 'mp4'
703
704                 # Retrieve video webpage to extract further information
705                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
706                 try:
707                         self.report_download_webpage(video_id)
708                         webpage = urllib2.urlopen(request).read()
709                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
710                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
711                         return
712
713                 # Extract URL, uploader, and title from webpage
714                 self.report_extraction(video_id)
715                 mobj = re.search(r"download_url:'([^']+)'", webpage)
716                 if mobj is None:
717                         video_extension = 'flv'
718                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
719                 if mobj is None:
720                         self._downloader.trouble(u'ERROR: unable to extract media URL')
721                         return
722                 mediaURL = urllib.unquote(mobj.group(1))
723                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
724                 mediaURL = mediaURL.replace('\\x26', '\x26')
725
726                 video_url = mediaURL
727
728                 mobj = re.search(r'<title>(.*)</title>', webpage)
729                 if mobj is None:
730                         self._downloader.trouble(u'ERROR: unable to extract title')
731                         return
732                 video_title = mobj.group(1).decode('utf-8')
733                 video_title = sanitize_title(video_title)
734                 simple_title = simplify_title(video_title)
735
736                 # Extract video description
737                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
738                 if mobj is None:
739                         self._downloader.trouble(u'ERROR: unable to extract video description')
740                         return
741                 video_description = mobj.group(1).decode('utf-8')
742                 if not video_description:
743                         video_description = 'No description available.'
744
745                 # Extract video thumbnail
746                 if self._downloader.params.get('forcethumbnail', False):
747                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
748                         try:
749                                 webpage = urllib2.urlopen(request).read()
750                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
751                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
752                                 return
753                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
754                         if mobj is None:
755                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
756                                 return
757                         video_thumbnail = mobj.group(1)
758                 else:   # we need something to pass to process_info
759                         video_thumbnail = ''
760
761                 return [{
762                         'id':           video_id.decode('utf-8'),
763                         'url':          video_url.decode('utf-8'),
764                         'uploader':     u'NA',
765                         'upload_date':  u'NA',
766                         'title':        video_title,
767                         'stitle':       simple_title,
768                         'ext':          video_extension.decode('utf-8'),
769                         'format':       u'NA',
770                         'player_url':   None,
771                 }]
772
773
774 class PhotobucketIE(InfoExtractor):
775         """Information extractor for photobucket.com."""
776
777         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
778         IE_NAME = u'photobucket'
779
780         def __init__(self, downloader=None):
781                 InfoExtractor.__init__(self, downloader)
782
783         def report_download_webpage(self, video_id):
784                 """Report webpage download."""
785                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
786
787         def report_extraction(self, video_id):
788                 """Report information extraction."""
789                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
790
791         def _real_extract(self, url):
792                 # Extract id from URL
793                 mobj = re.match(self._VALID_URL, url)
794                 if mobj is None:
795                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
796                         return
797
798                 video_id = mobj.group(1)
799
800                 video_extension = 'flv'
801
802                 # Retrieve video webpage to extract further information
803                 request = urllib2.Request(url)
804                 try:
805                         self.report_download_webpage(video_id)
806                         webpage = urllib2.urlopen(request).read()
807                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
808                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
809                         return
810
811                 # Extract URL, uploader, and title from webpage
812                 self.report_extraction(video_id)
813                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
814                 if mobj is None:
815                         self._downloader.trouble(u'ERROR: unable to extract media URL')
816                         return
817                 mediaURL = urllib.unquote(mobj.group(1))
818
819                 video_url = mediaURL
820
821                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
822                 if mobj is None:
823                         self._downloader.trouble(u'ERROR: unable to extract title')
824                         return
825                 video_title = mobj.group(1).decode('utf-8')
826                 video_title = sanitize_title(video_title)
827                 simple_title = simplify_title(video_title)
828
829                 video_uploader = mobj.group(2).decode('utf-8')
830
831                 return [{
832                         'id':           video_id.decode('utf-8'),
833                         'url':          video_url.decode('utf-8'),
834                         'uploader':     video_uploader,
835                         'upload_date':  u'NA',
836                         'title':        video_title,
837                         'stitle':       simple_title,
838                         'ext':          video_extension.decode('utf-8'),
839                         'format':       u'NA',
840                         'player_url':   None,
841                 }]
842
843
844 class YahooIE(InfoExtractor):
845         """Information extractor for video.yahoo.com."""
846
847         # _VALID_URL matches all Yahoo! Video URLs
848         # _VPAGE_URL matches only the extractable '/watch/' URLs
849         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
850         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
851         IE_NAME = u'video.yahoo'
852
853         def __init__(self, downloader=None):
854                 InfoExtractor.__init__(self, downloader)
855
856         def report_download_webpage(self, video_id):
857                 """Report webpage download."""
858                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
859
860         def report_extraction(self, video_id):
861                 """Report information extraction."""
862                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
863
864         def _real_extract(self, url, new_video=True):
865                 # Extract ID from URL
866                 mobj = re.match(self._VALID_URL, url)
867                 if mobj is None:
868                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
869                         return
870
871                 video_id = mobj.group(2)
872                 video_extension = 'flv'
873
874                 # Rewrite valid but non-extractable URLs as
875                 # extractable English language /watch/ URLs
876                 if re.match(self._VPAGE_URL, url) is None:
877                         request = urllib2.Request(url)
878                         try:
879                                 webpage = urllib2.urlopen(request).read()
880                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
881                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
882                                 return
883
884                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
885                         if mobj is None:
886                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
887                                 return
888                         yahoo_id = mobj.group(1)
889
890                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
891                         if mobj is None:
892                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
893                                 return
894                         yahoo_vid = mobj.group(1)
895
896                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
897                         return self._real_extract(url, new_video=False)
898
899                 # Retrieve video webpage to extract further information
900                 request = urllib2.Request(url)
901                 try:
902                         self.report_download_webpage(video_id)
903                         webpage = urllib2.urlopen(request).read()
904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
905                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
906                         return
907
908                 # Extract uploader and title from webpage
909                 self.report_extraction(video_id)
910                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
911                 if mobj is None:
912                         self._downloader.trouble(u'ERROR: unable to extract video title')
913                         return
914                 video_title = mobj.group(1).decode('utf-8')
915                 simple_title = simplify_title(video_title)
916
917                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
918                 if mobj is None:
919                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
920                         return
921                 video_uploader = mobj.group(1).decode('utf-8')
922
923                 # Extract video thumbnail
924                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
925                 if mobj is None:
926                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
927                         return
928                 video_thumbnail = mobj.group(1).decode('utf-8')
929
930                 # Extract video description
931                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
932                 if mobj is None:
933                         self._downloader.trouble(u'ERROR: unable to extract video description')
934                         return
935                 video_description = mobj.group(1).decode('utf-8')
936                 if not video_description:
937                         video_description = 'No description available.'
938
939                 # Extract video height and width
940                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
941                 if mobj is None:
942                         self._downloader.trouble(u'ERROR: unable to extract video height')
943                         return
944                 yv_video_height = mobj.group(1)
945
946                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
947                 if mobj is None:
948                         self._downloader.trouble(u'ERROR: unable to extract video width')
949                         return
950                 yv_video_width = mobj.group(1)
951
952                 # Retrieve video playlist to extract media URL
953                 # I'm not completely sure what all these options are, but we
954                 # seem to need most of them, otherwise the server sends a 401.
955                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
956                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
957                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
958                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
959                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
960                 try:
961                         self.report_download_webpage(video_id)
962                         webpage = urllib2.urlopen(request).read()
963                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
964                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
965                         return
966
967                 # Extract media URL from playlist XML
968                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
969                 if mobj is None:
970                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
971                         return
972                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
973                 video_url = unescapeHTML(video_url)
974
975                 return [{
976                         'id':           video_id.decode('utf-8'),
977                         'url':          video_url,
978                         'uploader':     video_uploader,
979                         'upload_date':  u'NA',
980                         'title':        video_title,
981                         'stitle':       simple_title,
982                         'ext':          video_extension.decode('utf-8'),
983                         'thumbnail':    video_thumbnail.decode('utf-8'),
984                         'description':  video_description,
985                         'thumbnail':    video_thumbnail,
986                         'player_url':   None,
987                 }]
988
989
990 class VimeoIE(InfoExtractor):
991         """Information extractor for vimeo.com."""
992
993         # _VALID_URL matches Vimeo URLs
994         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
995         IE_NAME = u'vimeo'
996
997         def __init__(self, downloader=None):
998                 InfoExtractor.__init__(self, downloader)
999
1000         def report_download_webpage(self, video_id):
1001                 """Report webpage download."""
1002                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1003
1004         def report_extraction(self, video_id):
1005                 """Report information extraction."""
1006                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1007
1008         def _real_extract(self, url, new_video=True):
1009                 # Extract ID from URL
1010                 mobj = re.match(self._VALID_URL, url)
1011                 if mobj is None:
1012                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1013                         return
1014
1015                 video_id = mobj.group(1)
1016
1017                 # Retrieve video webpage to extract further information
1018                 request = urllib2.Request(url, None, std_headers)
1019                 try:
1020                         self.report_download_webpage(video_id)
1021                         webpage = urllib2.urlopen(request).read()
1022                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1023                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1024                         return
1025
1026                 # Now we begin extracting as much information as we can from what we
1027                 # retrieved. First we extract the information common to all extractors,
1028                 # and latter we extract those that are Vimeo specific.
1029                 self.report_extraction(video_id)
1030
1031                 # Extract the config JSON
1032                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1033                 try:
1034                         config = json.loads(config)
1035                 except:
1036                         self._downloader.trouble(u'ERROR: unable to extract info section')
1037                         return
1038                 
1039                 # Extract title
1040                 video_title = config["video"]["title"]
1041                 simple_title = simplify_title(video_title)
1042
1043                 # Extract uploader
1044                 video_uploader = config["video"]["owner"]["name"]
1045
1046                 # Extract video thumbnail
1047                 video_thumbnail = config["video"]["thumbnail"]
1048
1049                 # Extract video description
1050                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1051                 if video_description: video_description = clean_html(video_description)
1052                 else: video_description = ''
1053
1054                 # Extract upload date
1055                 video_upload_date = u'NA'
1056                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1057                 if mobj is not None:
1058                         video_upload_date = mobj.group(1)
1059
1060                 # Vimeo specific: extract request signature and timestamp
1061                 sig = config['request']['signature']
1062                 timestamp = config['request']['timestamp']
1063
1064                 # Vimeo specific: extract video codec and quality information
1065                 # TODO bind to format param
1066                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1067                 for codec in codecs:
1068                         if codec[0] in config["video"]["files"]:
1069                                 video_codec = codec[0]
1070                                 video_extension = codec[1]
1071                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1072                                 else: quality = 'sd'
1073                                 break
1074                 else:
1075                         self._downloader.trouble(u'ERROR: no known codec found')
1076                         return
1077
1078                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1079                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1080
1081                 return [{
1082                         'id':           video_id,
1083                         'url':          video_url,
1084                         'uploader':     video_uploader,
1085                         'upload_date':  video_upload_date,
1086                         'title':        video_title,
1087                         'stitle':       simple_title,
1088                         'ext':          video_extension,
1089                         'thumbnail':    video_thumbnail,
1090                         'description':  video_description,
1091                         'player_url':   None,
1092                 }]
1093
1094
1095 class GenericIE(InfoExtractor):
1096         """Generic last-resort information extractor."""
1097
1098         _VALID_URL = r'.*'
1099         IE_NAME = u'generic'
1100
1101         def __init__(self, downloader=None):
1102                 InfoExtractor.__init__(self, downloader)
1103
1104         def report_download_webpage(self, video_id):
1105                 """Report webpage download."""
1106                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1107                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1108
1109         def report_extraction(self, video_id):
1110                 """Report information extraction."""
1111                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1112
1113         def report_following_redirect(self, new_url):
1114                 """Report information extraction."""
1115                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1116                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Issues a HEAD request for url; when the server redirects to a
		different final URL, hands that URL back to the downloader (so
		the extractor chain restarts on it) and returns True.  Returns
		False when the URL does not redirect.
		"""
		class HeadRequest(urllib2.Request):
			# Request subclass that forces the HEAD method so no response
			# body needs to be transferred just to discover the final URL.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					# Spaces in Location headers must be percent-encoded
					# before the URL can be re-requested.
					newurl = newurl.replace(' ', '%20')
					# Drop body-related headers: a HEAD request carries none.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(),
									   unverifiable=True)
				else:
					# Any other status is not a redirect we follow.
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the rejected response before retrying.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				# Re-open the same URL with a plain (GET) Request.
				return self.parent.open(urllib2.Request(req.get_full_url(),
												 headers=newheaders,
												 origin_req_host=req.get_origin_req_host(),
												 unverifiable=True))

		# Build our opener
		# (hand-assembled so our HEAD/405 handlers take part in the chain)
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		# Same URL back means there was no redirect.
		if url == new_url: return False

		self.report_following_redirect(new_url)
		# Restart the extraction chain on the redirect target.
		self._downloader.download([new_url])
		return True
1170
1171         def _real_extract(self, url):
1172                 if self._test_redirect(url): return
1173
1174                 video_id = url.split('/')[-1]
1175                 request = urllib2.Request(url)
1176                 try:
1177                         self.report_download_webpage(video_id)
1178                         webpage = urllib2.urlopen(request).read()
1179                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1180                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1181                         return
1182                 except ValueError, err:
1183                         # since this is the last-resort InfoExtractor, if
1184                         # this error is thrown, it'll be thrown here
1185                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1186                         return
1187
1188                 self.report_extraction(video_id)
1189                 # Start with something easy: JW Player in SWFObject
1190                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1191                 if mobj is None:
1192                         # Broaden the search a little bit
1193                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1194                 if mobj is None:
1195                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196                         return
1197
1198                 # It's possible that one of the regexes
1199                 # matched, but returned an empty group:
1200                 if mobj.group(1) is None:
1201                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1202                         return
1203
1204                 video_url = urllib.unquote(mobj.group(1))
1205                 video_id = os.path.basename(video_url)
1206
1207                 # here's a fun little line of code for you:
1208                 video_extension = os.path.splitext(video_id)[1][1:]
1209                 video_id = os.path.splitext(video_id)[0]
1210
1211                 # it's tempting to parse this further, but you would
1212                 # have to take into account all the variations like
1213                 #   Video Title - Site Name
1214                 #   Site Name | Video Title
1215                 #   Video Title - Tagline | Site Name
1216                 # and so on and so forth; it's just not practical
1217                 mobj = re.search(r'<title>(.*)</title>', webpage)
1218                 if mobj is None:
1219                         self._downloader.trouble(u'ERROR: unable to extract title')
1220                         return
1221                 video_title = mobj.group(1).decode('utf-8')
1222                 video_title = sanitize_title(video_title)
1223                 simple_title = simplify_title(video_title)
1224
1225                 # video uploader is domain name
1226                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1227                 if mobj is None:
1228                         self._downloader.trouble(u'ERROR: unable to extract title')
1229                         return
1230                 video_uploader = mobj.group(1).decode('utf-8')
1231
1232                 return [{
1233                         'id':           video_id.decode('utf-8'),
1234                         'url':          video_url.decode('utf-8'),
1235                         'uploader':     video_uploader,
1236                         'upload_date':  u'NA',
1237                         'title':        video_title,
1238                         'stitle':       simple_title,
1239                         'ext':          video_extension.decode('utf-8'),
1240                         'format':       u'NA',
1241                         'player_url':   None,
1242                 }]
1243
1244
1245 class YoutubeSearchIE(InfoExtractor):
1246         """Information Extractor for YouTube search queries."""
1247         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1248         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1249         _max_youtube_results = 1000
1250         IE_NAME = u'youtube:search'
1251
1252         def __init__(self, downloader=None):
1253                 InfoExtractor.__init__(self, downloader)
1254
1255         def report_download_page(self, query, pagenum):
1256                 """Report attempt to download playlist page with given number."""
1257                 query = query.decode(preferredencoding())
1258                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1259
1260         def _real_extract(self, query):
1261                 mobj = re.match(self._VALID_URL, query)
1262                 if mobj is None:
1263                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1264                         return
1265
1266                 prefix, query = query.split(':')
1267                 prefix = prefix[8:]
1268                 query = query.encode('utf-8')
1269                 if prefix == '':
1270                         self._download_n_results(query, 1)
1271                         return
1272                 elif prefix == 'all':
1273                         self._download_n_results(query, self._max_youtube_results)
1274                         return
1275                 else:
1276                         try:
1277                                 n = long(prefix)
1278                                 if n <= 0:
1279                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1280                                         return
1281                                 elif n > self._max_youtube_results:
1282                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1283                                         n = self._max_youtube_results
1284                                 self._download_n_results(query, n)
1285                                 return
1286                         except ValueError: # parsing prefix as integer fails
1287                                 self._download_n_results(query, 1)
1288                                 return
1289
1290         def _download_n_results(self, query, n):
1291                 """Downloads a specified number of results for a query"""
1292
1293                 video_ids = []
1294                 pagenum = 0
1295                 limit = n
1296
1297                 while (50 * pagenum) < limit:
1298                         self.report_download_page(query, pagenum+1)
1299                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1300                         request = urllib2.Request(result_url)
1301                         try:
1302                                 data = urllib2.urlopen(request).read()
1303                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1304                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1305                                 return
1306                         api_response = json.loads(data)['data']
1307
1308                         new_ids = list(video['id'] for video in api_response['items'])
1309                         video_ids += new_ids
1310
1311                         limit = min(n, api_response['totalItems'])
1312                         pagenum += 1
1313
1314                 if len(video_ids) > n:
1315                         video_ids = video_ids[:n]
1316                 for id in video_ids:
1317                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1318                 return
1319
1320
1321 class GoogleSearchIE(InfoExtractor):
1322         """Information Extractor for Google Video search queries."""
1323         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1324         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1325         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1326         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1327         _max_google_results = 1000
1328         IE_NAME = u'video.google:search'
1329
1330         def __init__(self, downloader=None):
1331                 InfoExtractor.__init__(self, downloader)
1332
1333         def report_download_page(self, query, pagenum):
1334                 """Report attempt to download playlist page with given number."""
1335                 query = query.decode(preferredencoding())
1336                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1337
1338         def _real_extract(self, query):
1339                 mobj = re.match(self._VALID_URL, query)
1340                 if mobj is None:
1341                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1342                         return
1343
1344                 prefix, query = query.split(':')
1345                 prefix = prefix[8:]
1346                 query = query.encode('utf-8')
1347                 if prefix == '':
1348                         self._download_n_results(query, 1)
1349                         return
1350                 elif prefix == 'all':
1351                         self._download_n_results(query, self._max_google_results)
1352                         return
1353                 else:
1354                         try:
1355                                 n = long(prefix)
1356                                 if n <= 0:
1357                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1358                                         return
1359                                 elif n > self._max_google_results:
1360                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1361                                         n = self._max_google_results
1362                                 self._download_n_results(query, n)
1363                                 return
1364                         except ValueError: # parsing prefix as integer fails
1365                                 self._download_n_results(query, 1)
1366                                 return
1367
1368         def _download_n_results(self, query, n):
1369                 """Downloads a specified number of results for a query"""
1370
1371                 video_ids = []
1372                 pagenum = 0
1373
1374                 while True:
1375                         self.report_download_page(query, pagenum)
1376                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1377                         request = urllib2.Request(result_url)
1378                         try:
1379                                 page = urllib2.urlopen(request).read()
1380                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1381                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1382                                 return
1383
1384                         # Extract video identifiers
1385                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1386                                 video_id = mobj.group(1)
1387                                 if video_id not in video_ids:
1388                                         video_ids.append(video_id)
1389                                         if len(video_ids) == n:
1390                                                 # Specified n videos reached
1391                                                 for id in video_ids:
1392                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1393                                                 return
1394
1395                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1396                                 for id in video_ids:
1397                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1398                                 return
1399
1400                         pagenum = pagenum + 1
1401
1402
1403 class YahooSearchIE(InfoExtractor):
1404         """Information Extractor for Yahoo! Video search queries."""
1405         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1406         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1407         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1408         _MORE_PAGES_INDICATOR = r'\s*Next'
1409         _max_yahoo_results = 1000
1410         IE_NAME = u'video.yahoo:search'
1411
1412         def __init__(self, downloader=None):
1413                 InfoExtractor.__init__(self, downloader)
1414
1415         def report_download_page(self, query, pagenum):
1416                 """Report attempt to download playlist page with given number."""
1417                 query = query.decode(preferredencoding())
1418                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1419
1420         def _real_extract(self, query):
1421                 mobj = re.match(self._VALID_URL, query)
1422                 if mobj is None:
1423                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1424                         return
1425
1426                 prefix, query = query.split(':')
1427                 prefix = prefix[8:]
1428                 query = query.encode('utf-8')
1429                 if prefix == '':
1430                         self._download_n_results(query, 1)
1431                         return
1432                 elif prefix == 'all':
1433                         self._download_n_results(query, self._max_yahoo_results)
1434                         return
1435                 else:
1436                         try:
1437                                 n = long(prefix)
1438                                 if n <= 0:
1439                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1440                                         return
1441                                 elif n > self._max_yahoo_results:
1442                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1443                                         n = self._max_yahoo_results
1444                                 self._download_n_results(query, n)
1445                                 return
1446                         except ValueError: # parsing prefix as integer fails
1447                                 self._download_n_results(query, 1)
1448                                 return
1449
1450         def _download_n_results(self, query, n):
1451                 """Downloads a specified number of results for a query"""
1452
1453                 video_ids = []
1454                 already_seen = set()
1455                 pagenum = 1
1456
1457                 while True:
1458                         self.report_download_page(query, pagenum)
1459                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1460                         request = urllib2.Request(result_url)
1461                         try:
1462                                 page = urllib2.urlopen(request).read()
1463                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1464                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1465                                 return
1466
1467                         # Extract video identifiers
1468                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1469                                 video_id = mobj.group(1)
1470                                 if video_id not in already_seen:
1471                                         video_ids.append(video_id)
1472                                         already_seen.add(video_id)
1473                                         if len(video_ids) == n:
1474                                                 # Specified n videos reached
1475                                                 for id in video_ids:
1476                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1477                                                 return
1478
1479                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1480                                 for id in video_ids:
1481                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1482                                 return
1483
1484                         pagenum = pagenum + 1
1485
1486
1487 class YoutubePlaylistIE(InfoExtractor):
1488         """Information Extractor for YouTube playlists."""
1489
1490         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1491         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1492         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
1493         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1494         IE_NAME = u'youtube:playlist'
1495
1496         def __init__(self, downloader=None):
1497                 InfoExtractor.__init__(self, downloader)
1498
1499         def report_download_page(self, playlist_id, pagenum):
1500                 """Report attempt to download playlist page with given number."""
1501                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1502
1503         def _real_extract(self, url):
1504                 # Extract playlist id
1505                 mobj = re.match(self._VALID_URL, url)
1506                 if mobj is None:
1507                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1508                         return
1509
1510                 # Single video case
1511                 if mobj.group(3) is not None:
1512                         self._downloader.download([mobj.group(3)])
1513                         return
1514
1515                 # Download playlist pages
1516                 # prefix is 'p' as default for playlists but there are other types that need extra care
1517                 playlist_prefix = mobj.group(1)
1518                 if playlist_prefix == 'a':
1519                         playlist_access = 'artist'
1520                 else:
1521                         playlist_prefix = 'p'
1522                         playlist_access = 'view_play_list'
1523                 playlist_id = mobj.group(2)
1524                 video_ids = []
1525                 pagenum = 1
1526
1527                 while True:
1528                         self.report_download_page(playlist_id, pagenum)
1529                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1530                         request = urllib2.Request(url)
1531                         try:
1532                                 page = urllib2.urlopen(request).read()
1533                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1534                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1535                                 return
1536
1537                         # Extract video identifiers
1538                         ids_in_page = []
1539                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1540                                 if mobj.group(1) not in ids_in_page:
1541                                         ids_in_page.append(mobj.group(1))
1542                         video_ids.extend(ids_in_page)
1543
1544                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1545                                 break
1546                         pagenum = pagenum + 1
1547
1548                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1549                 playlistend = self._downloader.params.get('playlistend', -1)
1550                 if playlistend == -1:
1551                         video_ids = video_ids[playliststart:]
1552                 else:
1553                         video_ids = video_ids[playliststart:playlistend]
1554
1555                 for id in video_ids:
1556                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1557                 return
1558
1559
1560 class YoutubeUserIE(InfoExtractor):
1561         """Information Extractor for YouTube users."""
1562
1563         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1564         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1565         _GDATA_PAGE_SIZE = 50
1566         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1567         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1568         IE_NAME = u'youtube:user'
1569
1570         def __init__(self, downloader=None):
1571                 InfoExtractor.__init__(self, downloader)
1572
1573         def report_download_page(self, username, start_index):
1574                 """Report attempt to download user page."""
1575                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1576                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1577
1578         def _real_extract(self, url):
1579                 # Extract username
1580                 mobj = re.match(self._VALID_URL, url)
1581                 if mobj is None:
1582                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1583                         return
1584
1585                 username = mobj.group(1)
1586
1587                 # Download video ids using YouTube Data API. Result size per
1588                 # query is limited (currently to 50 videos) so we need to query
1589                 # page by page until there are no video ids - it means we got
1590                 # all of them.
1591
1592                 video_ids = []
1593                 pagenum = 0
1594
1595                 while True:
1596                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1597                         self.report_download_page(username, start_index)
1598
1599                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1600
1601                         try:
1602                                 page = urllib2.urlopen(request).read()
1603                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1604                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1605                                 return
1606
1607                         # Extract video identifiers
1608                         ids_in_page = []
1609
1610                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1611                                 if mobj.group(1) not in ids_in_page:
1612                                         ids_in_page.append(mobj.group(1))
1613
1614                         video_ids.extend(ids_in_page)
1615
1616                         # A little optimization - if current page is not
1617                         # "full", ie. does not contain PAGE_SIZE video ids then
1618                         # we can assume that this page is the last one - there
1619                         # are no more ids on further pages - no need to query
1620                         # again.
1621
1622                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1623                                 break
1624
1625                         pagenum += 1
1626
1627                 all_ids_count = len(video_ids)
1628                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1629                 playlistend = self._downloader.params.get('playlistend', -1)
1630
1631                 if playlistend == -1:
1632                         video_ids = video_ids[playliststart:]
1633                 else:
1634                         video_ids = video_ids[playliststart:playlistend]
1635
1636                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1637                                 (username, all_ids_count, len(video_ids)))
1638
1639                 for video_id in video_ids:
1640                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1641
1642
1643 class DepositFilesIE(InfoExtractor):
1644         """Information extractor for depositfiles.com"""
1645
1646         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1647         IE_NAME = u'DepositFiles'
1648
1649         def __init__(self, downloader=None):
1650                 InfoExtractor.__init__(self, downloader)
1651
1652         def report_download_webpage(self, file_id):
1653                 """Report webpage download."""
1654                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1655
1656         def report_extraction(self, file_id):
1657                 """Report information extraction."""
1658                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1659
1660         def _real_extract(self, url):
1661                 file_id = url.split('/')[-1]
1662                 # Rebuild url in english locale
1663                 url = 'http://depositfiles.com/en/files/' + file_id
1664
1665                 # Retrieve file webpage with 'Free download' button pressed
1666                 free_download_indication = { 'gateway_result' : '1' }
1667                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1668                 try:
1669                         self.report_download_webpage(file_id)
1670                         webpage = urllib2.urlopen(request).read()
1671                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1672                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1673                         return
1674
1675                 # Search for the real file URL
1676                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1677                 if (mobj is None) or (mobj.group(1) is None):
1678                         # Try to figure out reason of the error.
1679                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1680                         if (mobj is not None) and (mobj.group(1) is not None):
1681                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1682                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1683                         else:
1684                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1685                         return
1686
1687                 file_url = mobj.group(1)
1688                 file_extension = os.path.splitext(file_url)[1][1:]
1689
1690                 # Search for file title
1691                 mobj = re.search(r'<b title="(.*?)">', webpage)
1692                 if mobj is None:
1693                         self._downloader.trouble(u'ERROR: unable to extract title')
1694                         return
1695                 file_title = mobj.group(1).decode('utf-8')
1696
1697                 return [{
1698                         'id':           file_id.decode('utf-8'),
1699                         'url':          file_url.decode('utf-8'),
1700                         'uploader':     u'NA',
1701                         'upload_date':  u'NA',
1702                         'title':        file_title,
1703                         'stitle':       file_title,
1704                         'ext':          file_extension.decode('utf-8'),
1705                         'format':       u'NA',
1706                         'player_url':   None,
1707                 }]
1708
1709
1710 class FacebookIE(InfoExtractor):
1711         """Information Extractor for Facebook"""
1712
1713         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1714         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1715         _NETRC_MACHINE = 'facebook'
1716         _available_formats = ['video', 'highqual', 'lowqual']
1717         _video_extensions = {
1718                 'video': 'mp4',
1719                 'highqual': 'mp4',
1720                 'lowqual': 'mp4',
1721         }
1722         IE_NAME = u'facebook'
1723
1724         def __init__(self, downloader=None):
1725                 InfoExtractor.__init__(self, downloader)
1726
1727         def _reporter(self, message):
1728                 """Add header and report message."""
1729                 self._downloader.to_screen(u'[facebook] %s' % message)
1730
	def report_login(self):
		"""Report that a login attempt is about to be made."""
		self._reporter(u'Logging in')
1734
1735         def report_video_webpage_download(self, video_id):
1736                 """Report attempt to download video webpage."""
1737                 self._reporter(u'%s: Downloading video webpage' % video_id)
1738
1739         def report_information_extraction(self, video_id):
1740                 """Report attempt to extract video information."""
1741                 self._reporter(u'%s: Extracting video information' % video_id)
1742
1743         def _parse_page(self, video_webpage):
1744                 """Extract video information from page"""
1745                 # General data
1746                 data = {'title': r'\("video_title", "(.*?)"\)',
1747                         'description': r'<div class="datawrap">(.*?)</div>',
1748                         'owner': r'\("video_owner_name", "(.*?)"\)',
1749                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1750                         }
1751                 video_info = {}
1752                 for piece in data.keys():
1753                         mobj = re.search(data[piece], video_webpage)
1754                         if mobj is not None:
1755                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1756
1757                 # Video urls
1758                 video_urls = {}
1759                 for fmt in self._available_formats:
1760                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1761                         if mobj is not None:
1762                                 # URL is in a Javascript segment inside an escaped Unicode format within
1763                                 # the generally utf-8 page
1764                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1765                 video_info['video_urls'] = video_urls
1766
1767                 return video_info
1768
1769         def _real_initialize(self):
1770                 if self._downloader is None:
1771                         return
1772
1773                 useremail = None
1774                 password = None
1775                 downloader_params = self._downloader.params
1776
1777                 # Attempt to use provided username and password or .netrc data
1778                 if downloader_params.get('username', None) is not None:
1779                         useremail = downloader_params['username']
1780                         password = downloader_params['password']
1781                 elif downloader_params.get('usenetrc', False):
1782                         try:
1783                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1784                                 if info is not None:
1785                                         useremail = info[0]
1786                                         password = info[2]
1787                                 else:
1788                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1789                         except (IOError, netrc.NetrcParseError), err:
1790                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1791                                 return
1792
1793                 if useremail is None:
1794                         return
1795
1796                 # Log in
1797                 login_form = {
1798                         'email': useremail,
1799                         'pass': password,
1800                         'login': 'Log+In'
1801                         }
1802                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1803                 try:
1804                         self.report_login()
1805                         login_results = urllib2.urlopen(request).read()
1806                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1807                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1808                                 return
1809                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1811                         return
1812
1813         def _real_extract(self, url):
1814                 mobj = re.match(self._VALID_URL, url)
1815                 if mobj is None:
1816                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1817                         return
1818                 video_id = mobj.group('ID')
1819
1820                 # Get video webpage
1821                 self.report_video_webpage_download(video_id)
1822                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1823                 try:
1824                         page = urllib2.urlopen(request)
1825                         video_webpage = page.read()
1826                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1827                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1828                         return
1829
1830                 # Start extracting information
1831                 self.report_information_extraction(video_id)
1832
1833                 # Extract information
1834                 video_info = self._parse_page(video_webpage)
1835
1836                 # uploader
1837                 if 'owner' not in video_info:
1838                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1839                         return
1840                 video_uploader = video_info['owner']
1841
1842                 # title
1843                 if 'title' not in video_info:
1844                         self._downloader.trouble(u'ERROR: unable to extract video title')
1845                         return
1846                 video_title = video_info['title']
1847                 video_title = video_title.decode('utf-8')
1848                 video_title = sanitize_title(video_title)
1849
1850                 simple_title = simplify_title(video_title)
1851
1852                 # thumbnail image
1853                 if 'thumbnail' not in video_info:
1854                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1855                         video_thumbnail = ''
1856                 else:
1857                         video_thumbnail = video_info['thumbnail']
1858
1859                 # upload date
1860                 upload_date = u'NA'
1861                 if 'upload_date' in video_info:
1862                         upload_time = video_info['upload_date']
1863                         timetuple = email.utils.parsedate_tz(upload_time)
1864                         if timetuple is not None:
1865                                 try:
1866                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1867                                 except:
1868                                         pass
1869
1870                 # description
1871                 video_description = video_info.get('description', 'No description available.')
1872
1873                 url_map = video_info['video_urls']
1874                 if len(url_map.keys()) > 0:
1875                         # Decide which formats to download
1876                         req_format = self._downloader.params.get('format', None)
1877                         format_limit = self._downloader.params.get('format_limit', None)
1878
1879                         if format_limit is not None and format_limit in self._available_formats:
1880                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1881                         else:
1882                                 format_list = self._available_formats
1883                         existing_formats = [x for x in format_list if x in url_map]
1884                         if len(existing_formats) == 0:
1885                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1886                                 return
1887                         if req_format is None:
1888                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1889                         elif req_format == 'worst':
1890                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1891                         elif req_format == '-1':
1892                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1893                         else:
1894                                 # Specific format
1895                                 if req_format not in url_map:
1896                                         self._downloader.trouble(u'ERROR: requested format not available')
1897                                         return
1898                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1899
1900                 results = []
1901                 for format_param, video_real_url in video_url_list:
1902                         # Extension
1903                         video_extension = self._video_extensions.get(format_param, 'mp4')
1904
1905                         results.append({
1906                                 'id':           video_id.decode('utf-8'),
1907                                 'url':          video_real_url.decode('utf-8'),
1908                                 'uploader':     video_uploader.decode('utf-8'),
1909                                 'upload_date':  upload_date,
1910                                 'title':        video_title,
1911                                 'stitle':       simple_title,
1912                                 'ext':          video_extension.decode('utf-8'),
1913                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1914                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1915                                 'description':  video_description.decode('utf-8'),
1916                                 'player_url':   None,
1917                         })
1918                 return results
1919
1920 class BlipTVIE(InfoExtractor):
1921         """Information extractor for blip.tv"""
1922
1923         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1924         _URL_EXT = r'^.*\.([a-z0-9]+)$'
1925         IE_NAME = u'blip.tv'
1926
1927         def report_extraction(self, file_id):
1928                 """Report information extraction."""
1929                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1930
1931         def report_direct_download(self, title):
1932                 """Report information extraction."""
1933                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1934
1935         def _real_extract(self, url):
1936                 mobj = re.match(self._VALID_URL, url)
1937                 if mobj is None:
1938                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1939                         return
1940
1941                 if '?' in url:
1942                         cchar = '&'
1943                 else:
1944                         cchar = '?'
1945                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1946                 request = urllib2.Request(json_url)
1947                 self.report_extraction(mobj.group(1))
1948                 info = None
1949                 try:
1950                         urlh = urllib2.urlopen(request)
1951                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1952                                 basename = url.split('/')[-1]
1953                                 title,ext = os.path.splitext(basename)
1954                                 title = title.decode('UTF-8')
1955                                 ext = ext.replace('.', '')
1956                                 self.report_direct_download(title)
1957                                 info = {
1958                                         'id': title,
1959                                         'url': url,
1960                                         'title': title,
1961                                         'stitle': simplify_title(title),
1962                                         'ext': ext,
1963                                         'urlhandle': urlh
1964                                 }
1965                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1966                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1967                         return
1968                 if info is None: # Regular URL
1969                         try:
1970                                 json_code = urlh.read()
1971                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1972                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1973                                 return
1974
1975                         try:
1976                                 json_data = json.loads(json_code)
1977                                 if 'Post' in json_data:
1978                                         data = json_data['Post']
1979                                 else:
1980                                         data = json_data
1981
1982                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1983                                 video_url = data['media']['url']
1984                                 umobj = re.match(self._URL_EXT, video_url)
1985                                 if umobj is None:
1986                                         raise ValueError('Can not determine filename extension')
1987                                 ext = umobj.group(1)
1988
1989                                 info = {
1990                                         'id': data['item_id'],
1991                                         'url': video_url,
1992                                         'uploader': data['display_name'],
1993                                         'upload_date': upload_date,
1994                                         'title': data['title'],
1995                                         'stitle': simplify_title(data['title']),
1996                                         'ext': ext,
1997                                         'format': data['media']['mimeType'],
1998                                         'thumbnail': data['thumbnailUrl'],
1999                                         'description': data['description'],
2000                                         'player_url': data['embedUrl']
2001                                 }
2002                         except (ValueError,KeyError), err:
2003                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2004                                 return
2005
2006                 return [info]
2007
2008
2009 class MyVideoIE(InfoExtractor):
2010         """Information Extractor for myvideo.de."""
2011
2012         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2013         IE_NAME = u'myvideo'
2014
2015         def __init__(self, downloader=None):
2016                 InfoExtractor.__init__(self, downloader)
2017         
2018         def report_download_webpage(self, video_id):
2019                 """Report webpage download."""
2020                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2021
2022         def report_extraction(self, video_id):
2023                 """Report information extraction."""
2024                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2025
2026         def _real_extract(self,url):
2027                 mobj = re.match(self._VALID_URL, url)
2028                 if mobj is None:
2029                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2030                         return
2031
2032                 video_id = mobj.group(1)
2033
2034                 # Get video webpage
2035                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2036                 try:
2037                         self.report_download_webpage(video_id)
2038                         webpage = urllib2.urlopen(request).read()
2039                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2040                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2041                         return
2042
2043                 self.report_extraction(video_id)
2044                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2045                                  webpage)
2046                 if mobj is None:
2047                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2048                         return
2049                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2050
2051                 mobj = re.search('<title>([^<]+)</title>', webpage)
2052                 if mobj is None:
2053                         self._downloader.trouble(u'ERROR: unable to extract title')
2054                         return
2055
2056                 video_title = mobj.group(1)
2057                 video_title = sanitize_title(video_title)
2058
2059                 simple_title = simplify_title(video_title)
2060
2061                 return [{
2062                         'id':           video_id,
2063                         'url':          video_url,
2064                         'uploader':     u'NA',
2065                         'upload_date':  u'NA',
2066                         'title':        video_title,
2067                         'stitle':       simple_title,
2068                         'ext':          u'flv',
2069                         'format':       u'NA',
2070                         'player_url':   None,
2071                 }]
2072
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname pseudo-URL (":tds", ":colbertreport", ...)
	# or a real full-episodes URL on either show's site.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report that episode extraction has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report download of the per-media configuration XML."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report download of the episode's MRSS index."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the SWF player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve an episode (or the newest one for a bare show name) into
		one info dict per media segment; returns a list of dicts."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Shortname form: rewrite to the show's full-episodes page and
		# re-match so the showname/episode groups are populated.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No episode part means "download the newest episode": the site
		# will redirect us to it below.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to the concrete episode URL and
			# re-extract the episode title from it.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The page embeds one or more mtvnservices Flash URLs; the second
		# capture group is the mgid-style media URI.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirect (needed for rtmpdump).
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# The MRSS index lists every media segment of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# guid looks like "...:<show>.com:<mediaId>"; derive both parts.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-segment config XML lists the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			# A segment without renditions is skipped, not fatal.
			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2205
2206
2207 class EscapistIE(InfoExtractor):
2208         """Information extractor for The Escapist """
2209
2210         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2211         IE_NAME = u'escapist'
2212
2213         def report_extraction(self, showName):
2214                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2215
2216         def report_config_download(self, showName):
2217                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2218
2219         def _real_extract(self, url):
2220                 mobj = re.match(self._VALID_URL, url)
2221                 if mobj is None:
2222                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2223                         return
2224                 showName = mobj.group('showname')
2225                 videoId = mobj.group('episode')
2226
2227                 self.report_extraction(showName)
2228                 try:
2229                         webPage = urllib2.urlopen(url).read()
2230                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2231                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2232                         return
2233
2234                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2235                 description = unescapeHTML(descMatch.group(1))
2236                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2237                 imgUrl = unescapeHTML(imgMatch.group(1))
2238                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2239                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2240                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2241                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2242
2243                 self.report_config_download(showName)
2244                 try:
2245                         configJSON = urllib2.urlopen(configUrl).read()
2246                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2247                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2248                         return
2249
2250                 # Technically, it's JavaScript, not JSON
2251                 configJSON = configJSON.replace("'", '"')
2252
2253                 try:
2254                         config = json.loads(configJSON)
2255                 except (ValueError,), err:
2256                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2257                         return
2258
2259                 playlist = config['playlist']
2260                 videoUrl = playlist[1]['url']
2261
2262                 info = {
2263                         'id': videoId,
2264                         'url': videoUrl,
2265                         'uploader': showName,
2266                         'upload_date': None,
2267                         'title': showName,
2268                         'stitle': simplify_title(showName),
2269                         'ext': 'flv',
2270                         'format': 'flv',
2271                         'thumbnail': imgUrl,
2272                         'description': description,
2273                         'player_url': playerUrl,
2274                 }
2275
2276                 return [info]
2277
2278
2279 class CollegeHumorIE(InfoExtractor):
2280         """Information extractor for collegehumor.com"""
2281
2282         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2283         IE_NAME = u'collegehumor'
2284
2285         def report_webpage(self, video_id):
2286                 """Report information extraction."""
2287                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2288
2289         def report_extraction(self, video_id):
2290                 """Report information extraction."""
2291                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2292
2293         def _real_extract(self, url):
2294                 mobj = re.match(self._VALID_URL, url)
2295                 if mobj is None:
2296                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2297                         return
2298                 video_id = mobj.group('videoid')
2299
2300                 self.report_webpage(video_id)
2301                 request = urllib2.Request(url)
2302                 try:
2303                         webpage = urllib2.urlopen(request).read()
2304                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2305                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2306                         return
2307
2308                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2309                 if m is None:
2310                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2311                         return
2312                 internal_video_id = m.group('internalvideoid')
2313
2314                 info = {
2315                         'id': video_id,
2316                         'internal_id': internal_video_id,
2317                 }
2318
2319                 self.report_extraction(video_id)
2320                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2321                 try:
2322                         metaXml = urllib2.urlopen(xmlUrl).read()
2323                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2324                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2325                         return
2326
2327                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2328                 try:
2329                         videoNode = mdoc.findall('./video')[0]
2330                         info['description'] = videoNode.findall('./description')[0].text
2331                         info['title'] = videoNode.findall('./caption')[0].text
2332                         info['stitle'] = simplify_title(info['title'])
2333                         info['url'] = videoNode.findall('./file')[0].text
2334                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2335                         info['ext'] = info['url'].rpartition('.')[2]
2336                         info['format'] = info['ext']
2337                 except IndexError:
2338                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2339                         return
2340
2341                 return [info]
2342
2343
2344 class XVideosIE(InfoExtractor):
2345         """Information extractor for xvideos.com"""
2346
2347         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2348         IE_NAME = u'xvideos'
2349
2350         def report_webpage(self, video_id):
2351                 """Report information extraction."""
2352                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2353
2354         def report_extraction(self, video_id):
2355                 """Report information extraction."""
2356                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2357
2358         def _real_extract(self, url):
2359                 mobj = re.match(self._VALID_URL, url)
2360                 if mobj is None:
2361                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2362                         return
2363                 video_id = mobj.group(1).decode('utf-8')
2364
2365                 self.report_webpage(video_id)
2366
2367                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2368                 try:
2369                         webpage = urllib2.urlopen(request).read()
2370                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2372                         return
2373
2374                 self.report_extraction(video_id)
2375
2376
2377                 # Extract video URL
2378                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2379                 if mobj is None:
2380                         self._downloader.trouble(u'ERROR: unable to extract video url')
2381                         return
2382                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2383
2384
2385                 # Extract title
2386                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2387                 if mobj is None:
2388                         self._downloader.trouble(u'ERROR: unable to extract video title')
2389                         return
2390                 video_title = mobj.group(1).decode('utf-8')
2391
2392
2393                 # Extract video thumbnail
2394                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2395                 if mobj is None:
2396                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2397                         return
2398                 video_thumbnail = mobj.group(1).decode('utf-8')
2399
2400                 info = {
2401                         'id': video_id,
2402                         'url': video_url,
2403                         'uploader': None,
2404                         'upload_date': None,
2405                         'title': video_title,
2406                         'stitle': simplify_title(video_title),
2407                         'ext': 'flv',
2408                         'format': 'flv',
2409                         'thumbnail': video_thumbnail,
2410                         'description': None,
2411                         'player_url': None,
2412                 }
2413
2414                 return [info]
2415
2416
2417 class SoundcloudIE(InfoExtractor):
2418         """Information extractor for soundcloud.com
2419            To access the media, the uid of the song and a stream token
2420            must be extracted from the page source and the script must make
2421            a request to media.soundcloud.com/crossdomain.xml. Then
2422            the media can be grabbed by requesting from an url composed
2423            of the stream token and uid
2424          """
2425
2426         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2427         IE_NAME = u'soundcloud'
2428
2429         def __init__(self, downloader=None):
2430                 InfoExtractor.__init__(self, downloader)
2431
2432         def report_webpage(self, video_id):
2433                 """Report information extraction."""
2434                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2435
2436         def report_extraction(self, video_id):
2437                 """Report information extraction."""
2438                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2439
2440         def _real_extract(self, url):
2441                 mobj = re.match(self._VALID_URL, url)
2442                 if mobj is None:
2443                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2444                         return
2445
2446                 # extract uploader (which is in the url)
2447                 uploader = mobj.group(1).decode('utf-8')
2448                 # extract simple title (uploader + slug of song title)
2449                 slug_title =  mobj.group(2).decode('utf-8')
2450                 simple_title = uploader + '-' + slug_title
2451
2452                 self.report_webpage('%s/%s' % (uploader, slug_title))
2453
2454                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2455                 try:
2456                         webpage = urllib2.urlopen(request).read()
2457                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2458                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2459                         return
2460
2461                 self.report_extraction('%s/%s' % (uploader, slug_title))
2462
2463                 # extract uid and stream token that soundcloud hands out for access
2464                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2465                 if mobj:
2466                         video_id = mobj.group(1)
2467                         stream_token = mobj.group(2)
2468
2469                 # extract unsimplified title
2470                 mobj = re.search('"title":"(.*?)",', webpage)
2471                 if mobj:
2472                         title = mobj.group(1)
2473
2474                 # construct media url (with uid/token)
2475                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2476                 mediaURL = mediaURL % (video_id, stream_token)
2477
2478                 # description
2479                 description = u'No description available'
2480                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2481                 if mobj:
2482                         description = mobj.group(1)
2483                 
2484                 # upload date
2485                 upload_date = None
2486                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2487                 if mobj:
2488                         try:
2489                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2490                         except Exception, e:
2491                                 print str(e)
2492
2493                 # for soundcloud, a request to a cross domain is required for cookies
2494                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2495
2496                 return [{
2497                         'id':           video_id.decode('utf-8'),
2498                         'url':          mediaURL,
2499                         'uploader':     uploader.decode('utf-8'),
2500                         'upload_date':  upload_date,
2501                         'title':        simple_title.decode('utf-8'),
2502                         'stitle':       simple_title.decode('utf-8'),
2503                         'ext':          u'mp3',
2504                         'format':       u'NA',
2505                         'player_url':   None,
2506                         'description': description.decode('utf-8')
2507                 }]
2508
2509
2510 class InfoQIE(InfoExtractor):
2511         """Information extractor for infoq.com"""
2512
2513         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2514         IE_NAME = u'infoq'
2515
2516         def report_webpage(self, video_id):
2517                 """Report information extraction."""
2518                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2519
2520         def report_extraction(self, video_id):
2521                 """Report information extraction."""
2522                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2523
2524         def _real_extract(self, url):
2525                 mobj = re.match(self._VALID_URL, url)
2526                 if mobj is None:
2527                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2528                         return
2529
2530                 self.report_webpage(url)
2531
2532                 request = urllib2.Request(url)
2533                 try:
2534                         webpage = urllib2.urlopen(request).read()
2535                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2536                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2537                         return
2538
2539                 self.report_extraction(url)
2540
2541
2542                 # Extract video URL
2543                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2544                 if mobj is None:
2545                         self._downloader.trouble(u'ERROR: unable to extract video url')
2546                         return
2547                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2548
2549
2550                 # Extract title
2551                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2552                 if mobj is None:
2553                         self._downloader.trouble(u'ERROR: unable to extract video title')
2554                         return
2555                 video_title = mobj.group(1).decode('utf-8')
2556
2557                 # Extract description
2558                 video_description = u'No description available.'
2559                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2560                 if mobj is not None:
2561                         video_description = mobj.group(1).decode('utf-8')
2562
2563                 video_filename = video_url.split('/')[-1]
2564                 video_id, extension = video_filename.split('.')
2565
2566                 info = {
2567                         'id': video_id,
2568                         'url': video_url,
2569                         'uploader': None,
2570                         'upload_date': None,
2571                         'title': video_title,
2572                         'stitle': simplify_title(video_title),
2573                         'ext': extension,
2574                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2575                         'thumbnail': None,
2576                         'description': video_description,
2577                         'player_url': None,
2578                 }
2579
2580                 return [info]
2581
2582 class MixcloudIE(InfoExtractor):
2583         """Information extractor for www.mixcloud.com"""
2584         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2585         IE_NAME = u'mixcloud'
2586
2587         def __init__(self, downloader=None):
2588                 InfoExtractor.__init__(self, downloader)
2589
2590         def report_download_json(self, file_id):
2591                 """Report JSON download."""
2592                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2593
2594         def report_extraction(self, file_id):
2595                 """Report information extraction."""
2596                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2597
2598         def get_urls(self, jsonData, fmt, bitrate='best'):
2599                 """Get urls from 'audio_formats' section in json"""
2600                 file_url = None
2601                 try:
2602                         bitrate_list = jsonData[fmt]
2603                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2604                                 bitrate = max(bitrate_list) # select highest
2605
2606                         url_list = jsonData[fmt][bitrate]
2607                 except TypeError: # we have no bitrate info.
2608                         url_list = jsonData[fmt]
2609                 return url_list
2610
2611         def check_urls(self, url_list):
2612                 """Returns 1st active url from list"""
2613                 for url in url_list:
2614                         try:
2615                                 urllib2.urlopen(url)
2616                                 return url
2617                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618                                 url = None
2619
2620                 return None
2621
2622         def _print_formats(self, formats):
2623                 print 'Available formats:'
2624                 for fmt in formats.keys():
2625                         for b in formats[fmt]:
2626                                 try:
2627                                         ext = formats[fmt][b][0]
2628                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2629                                 except TypeError: # we have no bitrate info
2630                                         ext = formats[fmt][0]
2631                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2632                                         break
2633
2634         def _real_extract(self, url):
2635                 mobj = re.match(self._VALID_URL, url)
2636                 if mobj is None:
2637                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2638                         return
2639                 # extract uploader & filename from url
2640                 uploader = mobj.group(1).decode('utf-8')
2641                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2642
2643                 # construct API request
2644                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2645                 # retrieve .json file with links to files
2646                 request = urllib2.Request(file_url)
2647                 try:
2648                         self.report_download_json(file_url)
2649                         jsonData = urllib2.urlopen(request).read()
2650                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2651                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2652                         return
2653
2654                 # parse JSON
2655                 json_data = json.loads(jsonData)
2656                 player_url = json_data['player_swf_url']
2657                 formats = dict(json_data['audio_formats'])
2658
2659                 req_format = self._downloader.params.get('format', None)
2660                 bitrate = None
2661
2662                 if self._downloader.params.get('listformats', None):
2663                         self._print_formats(formats)
2664                         return
2665
2666                 if req_format is None or req_format == 'best':
2667                         for format_param in formats.keys():
2668                                 url_list = self.get_urls(formats, format_param)
2669                                 # check urls
2670                                 file_url = self.check_urls(url_list)
2671                                 if file_url is not None:
2672                                         break # got it!
2673                 else:
2674                         if req_format not in formats.keys():
2675                                 self._downloader.trouble(u'ERROR: format is not available')
2676                                 return
2677
2678                         url_list = self.get_urls(formats, req_format)
2679                         file_url = self.check_urls(url_list)
2680                         format_param = req_format
2681
2682                 return [{
2683                         'id': file_id.decode('utf-8'),
2684                         'url': file_url.decode('utf-8'),
2685                         'uploader':     uploader.decode('utf-8'),
2686                         'upload_date': u'NA',
2687                         'title': json_data['name'],
2688                         'stitle': simplify_title(json_data['name']),
2689                         'ext': file_url.split('.')[-1].decode('utf-8'),
2690                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2691                         'thumbnail': json_data['thumbnail_url'],
2692                         'description': json_data['description'],
2693                         'player_url': player_url.decode('utf-8'),
2694                 }]
2695
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract a single video, a course playlist, or the root playlist.

		Three URL shapes are handled, chosen by the named regex groups:
		 - course + video: return a one-element list with the video's info
		 - course only:    recursively extract every VideoPage linked from
		                   the course page and return the combined results
		 - neither:        recursively extract every CoursePage linked from
		                   the site's home page and return the results
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the videos
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title; fall back to the id when the page has no <h1>
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Collect the course's video page links, de-duplicated in order
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			# Recurse into each referenced video page and flatten the results
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = simplify_title(info['title'])

			# Collect all course page links and recurse into each of them
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2810
2811 class MTVIE(InfoExtractor):
2812         """Information extractor for MTV.com"""
2813
2814         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2815         IE_NAME = u'mtv'
2816
2817         def report_webpage(self, video_id):
2818                 """Report information extraction."""
2819                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2820
2821         def report_extraction(self, video_id):
2822                 """Report information extraction."""
2823                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2824
2825         def _real_extract(self, url):
2826                 mobj = re.match(self._VALID_URL, url)
2827                 if mobj is None:
2828                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2829                         return
2830                 if not mobj.group('proto'):
2831                         url = 'http://' + url
2832                 video_id = mobj.group('videoid')
2833                 self.report_webpage(video_id)
2834
2835                 request = urllib2.Request(url)
2836                 try:
2837                         webpage = urllib2.urlopen(request).read()
2838                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2839                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2840                         return
2841
2842                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2843                 if mobj is None:
2844                         self._downloader.trouble(u'ERROR: unable to extract song name')
2845                         return
2846                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2847                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2848                 if mobj is None:
2849                         self._downloader.trouble(u'ERROR: unable to extract performer')
2850                         return
2851                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2852                 video_title = performer + ' - ' + song_name 
2853
2854                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2855                 if mobj is None:
2856                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2857                         return
2858                 mtvn_uri = mobj.group(1)
2859
2860                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2861                 if mobj is None:
2862                         self._downloader.trouble(u'ERROR: unable to extract content id')
2863                         return
2864                 content_id = mobj.group(1)
2865
2866                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2867                 self.report_extraction(video_id)
2868                 request = urllib2.Request(videogen_url)
2869                 try:
2870                         metadataXml = urllib2.urlopen(request).read()
2871                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2872                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2873                         return
2874
2875                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2876                 renditions = mdoc.findall('.//rendition')
2877
2878                 # For now, always pick the highest quality.
2879                 rendition = renditions[-1]
2880
2881                 try:
2882                         _,_,ext = rendition.attrib['type'].partition('/')
2883                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2884                         video_url = rendition.find('./src').text
2885                 except KeyError:
2886                         self._downloader.trouble('Invalid rendition field.')
2887                         return
2888
2889                 info = {
2890                         'id': video_id,
2891                         'url': video_url,
2892                         'uploader': performer,
2893                         'title': video_title,
2894                         'stitle': simplify_title(video_title),
2895                         'ext': ext,
2896                         'format': format,
2897                 }
2898
2899                 return [info]