_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
  21 # parse_qs was moved from the cgi module to the urlparse module recently.
  22 try:
  23         from urlparse import parse_qs
  24 except ImportError:
  25         from cgi import parse_qs
  26
  27 try:
  28         import xml.etree.ElementTree
  29 except ImportError: # Python<2.5: Not officially supported, but let it slip
  30         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
  31
  32 from utils import *
  33
  34
  35 class InfoExtractor(object):
  36         """Information Extractor class.
  37
  38         Information extractors are the classes that, given a URL, extract
  39         information from the video (or videos) the URL refers to. This
  40         information includes the real video URL, the video title and simplified
  41         title, author and others. The information is stored in a dictionary
  42         which is then passed to the FileDownloader. The FileDownloader
  43         processes this information possibly downloading the video to the file
  44         system, among other possible outcomes. The dictionaries must include
  45         the following fields:
  46
  47         id:             Video identifier.
  48         url:            Final video URL.
  49         uploader:       Nickname of the video uploader.
  50         title:          Literal title.
  51         stitle:         Simplified title.
  52         ext:            Video filename extension.
  53         format:         Video format.
  54         player_url:     SWF Player URL (may be None).
  55
  56         The following fields are optional. Their primary purpose is to allow
  57         youtube-dl to serve as the backend for a video search function, such
  58         as the one in youtube2mp3.  They are only used when their respective
  59         forced printing functions are called:
  60
  61         thumbnail:      Full URL to a video thumbnail image.
  62         description:    One-line video description.
  63
  64         Subclasses of this one should re-define the _real_initialize() and
  65         _real_extract() methods and define a _VALID_URL regexp.
  66         Probably, they should also be added to the list of extractors.
  67         """
  68
  69         _ready = False
  70         _downloader = None
  71
  72         def __init__(self, downloader=None):
  73                 """Constructor. Receives an optional downloader."""
  74                 self._ready = False
  75                 self.set_downloader(downloader)
  76
  77         def suitable(self, url):
  78                 """Receives a URL and returns True if suitable for this IE."""
  79                 return re.match(self._VALID_URL, url) is not None
  80
  81         def initialize(self):
  82                 """Initializes an instance (authentication, etc)."""
  83                 if not self._ready:
  84                         self._real_initialize()
  85                         self._ready = True
  86
  87         def extract(self, url):
  88                 """Extracts URL information and returns it in list of dicts."""
  89                 self.initialize()
  90                 return self._real_extract(url)
  91
  92         def set_downloader(self, downloader):
  93                 """Sets the downloader for this IE."""
  94                 self._downloader = downloader
  95
  96         def _real_initialize(self):
  97                 """Real initialization process. Redefine in subclasses."""
  98                 pass
  99
 100         def _real_extract(self, url):
 101                 """Real extraction process. Redefine in subclasses."""
 102                 pass
 103
 104
 105 class YoutubeIE(InfoExtractor):
 106         """Information extractor for youtube.com."""
 107
 108         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
 109         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 110         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 111         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 112         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 113         _NETRC_MACHINE = 'youtube'
 114         # Listed in order of quality
 115         _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 116         _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 117         _video_extensions = {
 118                 '13': '3gp',
 119                 '17': 'mp4',
 120                 '18': 'mp4',
 121                 '22': 'mp4',
 122                 '37': 'mp4',
 123                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 124                 '43': 'webm',
 125                 '44': 'webm',
 126                 '45': 'webm',
 127         }
 128         _video_dimensions = {
 129                 '5': '240x400',
 130                 '6': '???',
 131                 '13': '???',
 132                 '17': '144x176',
 133                 '18': '360x640',
 134                 '22': '720x1280',
 135                 '34': '360x640',
 136                 '35': '480x854',
 137                 '37': '1080x1920',
 138                 '38': '3072x4096',
 139                 '43': '360x640',
 140                 '44': '480x854',
 141                 '45': '720x1280',
 142         }
 143         IE_NAME = u'youtube'
 144
 145         def report_lang(self):
 146                 """Report attempt to set language."""
 147                 self._downloader.to_screen(u'[youtube] Setting language')
 148
 149         def report_login(self):
 150                 """Report attempt to log in."""
 151                 self._downloader.to_screen(u'[youtube] Logging in')
 152
 153         def report_age_confirmation(self):
 154                 """Report attempt to confirm age."""
 155                 self._downloader.to_screen(u'[youtube] Confirming age')
 156
 157         def report_video_webpage_download(self, video_id):
 158                 """Report attempt to download video webpage."""
 159                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 160
 161         def report_video_info_webpage_download(self, video_id):
 162                 """Report attempt to download video info webpage."""
 163                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 164
 165         def report_video_subtitles_download(self, video_id):
 166                 """Report attempt to download video info webpage."""
 167                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 168
 169         def report_information_extraction(self, video_id):
 170                 """Report attempt to extract video information."""
 171                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 172
 173         def report_unavailable_format(self, video_id, format):
 174                 """Report extracted video URL."""
 175                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 176
 177         def report_rtmp_download(self):
 178                 """Indicate the download will use the RTMP protocol."""
 179                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 180
 181         def _closed_captions_xml_to_srt(self, xml_string):
 182                 srt = ''
 183                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 184                 # TODO parse xml instead of regex
 185                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 186                         if not dur: dur = '4'
 187                         start = float(start)
 188                         end = start + float(dur)
 189                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 190                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 191                         caption = unescapeHTML(caption)
 192                         caption = unescapeHTML(caption) # double cycle, inentional
 193                         srt += str(n) + '\n'
 194                         srt += start + ' --> ' + end + '\n'
 195                         srt += caption + '\n\n'
 196                 return srt
 197
 198         def _print_formats(self, formats):
 199                 print 'Available formats:'
 200                 for x in formats:
 201                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 202
 203         def _real_initialize(self):
 204                 if self._downloader is None:
 205                         return
 206
 207                 username = None
 208                 password = None
 209                 downloader_params = self._downloader.params
 210
 211                 # Attempt to use provided username and password or .netrc data
 212                 if downloader_params.get('username', None) is not None:
 213                         username = downloader_params['username']
 214                         password = downloader_params['password']
 215                 elif downloader_params.get('usenetrc', False):
 216                         try:
 217                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 218                                 if info is not None:
 219                                         username = info[0]
 220                                         password = info[2]
 221                                 else:
 222                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 223                         except (IOError, netrc.NetrcParseError), err:
 224                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 225                                 return
 226
 227                 # Set language
 228                 request = urllib2.Request(self._LANG_URL)
 229                 try:
 230                         self.report_lang()
 231                         urllib2.urlopen(request).read()
 232                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 233                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 234                         return
 235
 236                 # No authentication to be performed
 237                 if username is None:
 238                         return
 239
 240                 # Log in
 241                 login_form = {
 242                                 'current_form': 'loginForm',
 243                                 'next':         '/',
 244                                 'action_login': 'Log In',
 245                                 'username':     username,
 246                                 'password':     password,
 247                                 }
 248                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 249                 try:
 250                         self.report_login()
 251                         login_results = urllib2.urlopen(request).read()
 252                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 253                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 254                                 return
 255                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 256                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 257                         return
 258
 259                 # Confirm age
 260                 age_form = {
 261                                 'next_url':             '/',
 262                                 'action_confirm':       'Confirm',
 263                                 }
 264                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 265                 try:
 266                         self.report_age_confirmation()
 267                         age_results = urllib2.urlopen(request).read()
 268                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 269                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 270                         return
 271
 272         def _real_extract(self, url):
 273                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 274                 mobj = re.search(self._NEXT_URL_RE, url)
 275                 if mobj:
 276                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 277
 278                 # Extract video id from URL
 279                 mobj = re.match(self._VALID_URL, url)
 280                 if mobj is None:
 281                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 282                         return
 283                 video_id = mobj.group(2)
 284
 285                 # Get video webpage
 286                 self.report_video_webpage_download(video_id)
 287                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 288                 try:
 289                         video_webpage = urllib2.urlopen(request).read()
 290                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 291                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 292                         return
 293
 294                 # Attempt to extract SWF player URL
 295                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 296                 if mobj is not None:
 297                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 298                 else:
 299                         player_url = None
 300
 301                 # Get video info
 302                 self.report_video_info_webpage_download(video_id)
 303                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 304                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 305                                         % (video_id, el_type))
 306                         request = urllib2.Request(video_info_url)
 307                         try:
 308                                 video_info_webpage = urllib2.urlopen(request).read()
 309                                 video_info = parse_qs(video_info_webpage)
 310                                 if 'token' in video_info:
 311                                         break
 312                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 313                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 314                                 return
 315                 if 'token' not in video_info:
 316                         if 'reason' in video_info:
 317                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 318                         else:
 319                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 320                         return
 321
 322                 # Start extracting information
 323                 self.report_information_extraction(video_id)
 324
 325                 # uploader
 326                 if 'author' not in video_info:
 327                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 328                         return
 329                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 330
 331                 # title
 332                 if 'title' not in video_info:
 333                         self._downloader.trouble(u'ERROR: unable to extract video title')
 334                         return
 335                 video_title = urllib.unquote_plus(video_info['title'][0])
 336                 video_title = video_title.decode('utf-8')
 337                 video_title = sanitize_title(video_title)
 338
 339                 # simplified title
 340                 simple_title = simplify_title(video_title)
 341
 342                 # thumbnail image
 343                 if 'thumbnail_url' not in video_info:
 344                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 345                         video_thumbnail = ''
 346                 else:   # don't panic if we can't find it
 347                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 348
 349                 # upload date
 350                 upload_date = u'NA'
 351                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 352                 if mobj is not None:
 353                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 354                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 355                         for expression in format_expressions:
 356                                 try:
 357                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 358                                 except:
 359                                         pass
 360
 361                 # description
 362                 video_description = get_element_by_id("eow-description", video_webpage)
 363                 if video_description: video_description = clean_html(video_description.decode('utf8'))
 364                 else: video_description = ''
 365
 366                 # closed captions
 367                 video_subtitles = None
 368                 if self._downloader.params.get('writesubtitles', False):
 369                         self.report_video_subtitles_download(video_id)
 370                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 371                         try:
 372                                 srt_list = urllib2.urlopen(request).read()
 373                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 374                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 375                         else:
 376                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
 377                                 if srt_lang_list:
 378                                         if self._downloader.params.get('subtitleslang', False):
 379                                                 srt_lang = self._downloader.params.get('subtitleslang')
 380                                         elif 'en' in srt_lang_list:
 381                                                 srt_lang = 'en'
 382                                         else:
 383                                                 srt_lang = srt_lang_list[0]
 384                                         if not srt_lang in srt_lang_list:
 385                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
 386                                         else:
 387                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
 388                                                 try:
 389                                                         srt_xml = urllib2.urlopen(request).read()
 390                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 391                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 392                                                 else:
 393                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 394                                 else:
 395                                         self._downloader.trouble(u'WARNING: video has no closed captions')
 396
 397                 # token
 398                 video_token = urllib.unquote_plus(video_info['token'][0])
 399
 400                 # Decide which formats to download
 401                 req_format = self._downloader.params.get('format', None)
 402
 403                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 404                         self.report_rtmp_download()
 405                         video_url_list = [(None, video_info['conn'][0])]
 406                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 407                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 408                         url_data = [parse_qs(uds) for uds in url_data_strs]
 409                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 410                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
 411
 412                         format_limit = self._downloader.params.get('format_limit', None)
 413                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 414                         if format_limit is not None and format_limit in available_formats:
 415                                 format_list = available_formats[available_formats.index(format_limit):]
 416                         else:
 417                                 format_list = available_formats
 418                         existing_formats = [x for x in format_list if x in url_map]
 419                         if len(existing_formats) == 0:
 420                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 421                                 return
 422                         if self._downloader.params.get('listformats', None):
 423                                 self._print_formats(existing_formats)
 424                                 return
 425                         if req_format is None or req_format == 'best':
 426                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 427                         elif req_format == 'worst':
 428                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 429                         elif req_format in ('-1', 'all'):
 430                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 431                         else:
 432                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 433                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 434                                 req_formats = req_format.split('/')
 435                                 video_url_list = None
 436                                 for rf in req_formats:
 437                                         if rf in url_map:
 438                                                 video_url_list = [(rf, url_map[rf])]
 439                                                 break
 440                                 if video_url_list is None:
 441                                         self._downloader.trouble(u'ERROR: requested format not available')
 442                                         return
 443                 else:
 444                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 445                         return
 446
 447                 results = []
 448                 for format_param, video_real_url in video_url_list:
 449                         # Extension
 450                         video_extension = self._video_extensions.get(format_param, 'flv')
 451
 452                         results.append({
 453                                 'id':           video_id.decode('utf-8'),
 454                                 'url':          video_real_url.decode('utf-8'),
 455                                 'uploader':     video_uploader.decode('utf-8'),
 456                                 'upload_date':  upload_date,
 457                                 'title':        video_title,
 458                                 'stitle':       simple_title,
 459                                 'ext':          video_extension.decode('utf-8'),
 460                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 461                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 462                                 'description':  video_description,
 463                                 'player_url':   player_url,
 464                                 'subtitles':    video_subtitles
 465                         })
 466                 return results
 467
 468
 469 class MetacafeIE(InfoExtractor):
 470         """Information Extractor for metacafe.com."""
 471
 472         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 473         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 474         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 475         IE_NAME = u'metacafe'
 476
 477         def __init__(self, downloader=None):
 478                 InfoExtractor.__init__(self, downloader)
 479
 480         def report_disclaimer(self):
 481                 """Report disclaimer retrieval."""
 482                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 483
 484         def report_age_confirmation(self):
 485                 """Report attempt to confirm age."""
 486                 self._downloader.to_screen(u'[metacafe] Confirming age')
 487
 488         def report_download_webpage(self, video_id):
 489                 """Report webpage download."""
 490                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 491
 492         def report_extraction(self, video_id):
 493                 """Report information extraction."""
 494                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 495
 496         def _real_initialize(self):
 497                 # Retrieve disclaimer
 498                 request = urllib2.Request(self._DISCLAIMER)
 499                 try:
 500                         self.report_disclaimer()
 501                         disclaimer = urllib2.urlopen(request).read()
 502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 503                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 504                         return
 505
 506                 # Confirm age
 507                 disclaimer_form = {
 508                         'filters': '0',
 509                         'submit': "Continue - I'm over 18",
 510                         }
 511                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 512                 try:
 513                         self.report_age_confirmation()
 514                         disclaimer = urllib2.urlopen(request).read()
 515                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 516                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 517                         return
 518
 519         def _real_extract(self, url):
 520                 # Extract id and simplified title from URL
 521                 mobj = re.match(self._VALID_URL, url)
 522                 if mobj is None:
 523                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 524                         return
 525
 526                 video_id = mobj.group(1)
 527
 528                 # Check if video comes from YouTube
 529                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 530                 if mobj2 is not None:
 531                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 532                         return
 533
 534                 simple_title = mobj.group(2).decode('utf-8')
 535
 536                 # Retrieve video webpage to extract further information
 537                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 538                 try:
 539                         self.report_download_webpage(video_id)
 540                         webpage = urllib2.urlopen(request).read()
 541                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 542                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 543                         return
 544
 545                 # Extract URL, uploader and title from webpage
 546                 self.report_extraction(video_id)
 547                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 548                 if mobj is not None:
 549                         mediaURL = urllib.unquote(mobj.group(1))
 550                         video_extension = mediaURL[-3:]
 551
 552                         # Extract gdaKey if available
 553                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 554                         if mobj is None:
 555                                 video_url = mediaURL
 556                         else:
 557                                 gdaKey = mobj.group(1)
 558                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 559                 else:
 560                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 561                         if mobj is None:
 562                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 563                                 return
 564                         vardict = parse_qs(mobj.group(1))
 565                         if 'mediaData' not in vardict:
 566                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 567                                 return
 568                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 569                         if mobj is None:
 570                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 571                                 return
 572                         mediaURL = mobj.group(1).replace('\\/', '/')
 573                         video_extension = mediaURL[-3:]
 574                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 575
 576                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 577                 if mobj is None:
 578                         self._downloader.trouble(u'ERROR: unable to extract title')
 579                         return
 580                 video_title = mobj.group(1).decode('utf-8')
 581                 video_title = sanitize_title(video_title)
 582
 583                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 584                 if mobj is None:
 585                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 586                         return
 587                 video_uploader = mobj.group(1)
 588
 589                 return [{
 590                         'id':           video_id.decode('utf-8'),
 591                         'url':          video_url.decode('utf-8'),
 592                         'uploader':     video_uploader.decode('utf-8'),
 593                         'upload_date':  u'NA',
 594                         'title':        video_title,
 595                         'stitle':       simple_title,
 596                         'ext':          video_extension.decode('utf-8'),
 597                         'format':       u'NA',
 598                         'player_url':   None,
 599                 }]
 600
 601
 602 class DailymotionIE(InfoExtractor):
 603         """Information Extractor for Dailymotion"""
 604
 605         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
 606         IE_NAME = u'dailymotion'
 607
 608         def __init__(self, downloader=None):
 609                 InfoExtractor.__init__(self, downloader)
 610
 611         def report_download_webpage(self, video_id):
 612                 """Report webpage download."""
 613                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 614
 615         def report_extraction(self, video_id):
 616                 """Report information extraction."""
 617                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 618
 619         def _real_extract(self, url):
 620                 # Extract id and simplified title from URL
 621                 mobj = re.match(self._VALID_URL, url)
 622                 if mobj is None:
 623                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 624                         return
 625
 626                 video_id = mobj.group(1)
 627
 628                 video_extension = 'flv'
 629
 630                 # Retrieve video webpage to extract further information
 631                 request = urllib2.Request(url)
 632                 request.add_header('Cookie', 'family_filter=off')
 633                 try:
 634                         self.report_download_webpage(video_id)
 635                         webpage = urllib2.urlopen(request).read()
 636                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 637                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 638                         return
 639
 640                 # Extract URL, uploader and title from webpage
 641                 self.report_extraction(video_id)
 642                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
 643                 if mobj is None:
 644                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 645                         return
 646                 sequence = urllib.unquote(mobj.group(1))
 647                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
 648                 if mobj is None:
 649                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 650                         return
 651                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
 652
 653                 # if needed add http://www.dailymotion.com/ if relative URL
 654
 655                 video_url = mediaURL
 656
 657                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 658                 if mobj is None:
 659                         self._downloader.trouble(u'ERROR: unable to extract title')
 660                         return
 661                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 662                 video_title = sanitize_title(video_title)
 663                 simple_title = simplify_title(video_title)
 664
 665                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
 666                 if mobj is None:
 667                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 668                         return
 669                 video_uploader = mobj.group(1)
 670
 671                 return [{
 672                         'id':           video_id.decode('utf-8'),
 673                         'url':          video_url.decode('utf-8'),
 674                         'uploader':     video_uploader.decode('utf-8'),
 675                         'upload_date':  u'NA',
 676                         'title':        video_title,
 677                         'stitle':       simple_title,
 678                         'ext':          video_extension.decode('utf-8'),
 679                         'format':       u'NA',
 680                         'player_url':   None,
 681                 }]
 682
 683
 684 class GoogleIE(InfoExtractor):
 685         """Information extractor for video.google.com."""
 686
 687         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 688         IE_NAME = u'video.google'
 689
 690         def __init__(self, downloader=None):
 691                 InfoExtractor.__init__(self, downloader)
 692
 693         def report_download_webpage(self, video_id):
 694                 """Report webpage download."""
 695                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 696
 697         def report_extraction(self, video_id):
 698                 """Report information extraction."""
 699                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 700
 701         def _real_extract(self, url):
 702                 # Extract id from URL
 703                 mobj = re.match(self._VALID_URL, url)
 704                 if mobj is None:
 705                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 706                         return
 707
 708                 video_id = mobj.group(1)
 709
 710                 video_extension = 'mp4'
 711
 712                 # Retrieve video webpage to extract further information
 713                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 714                 try:
 715                         self.report_download_webpage(video_id)
 716                         webpage = urllib2.urlopen(request).read()
 717                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 718                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 719                         return
 720
 721                 # Extract URL, uploader, and title from webpage
 722                 self.report_extraction(video_id)
 723                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 724                 if mobj is None:
 725                         video_extension = 'flv'
 726                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 727                 if mobj is None:
 728                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 729                         return
 730                 mediaURL = urllib.unquote(mobj.group(1))
 731                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 732                 mediaURL = mediaURL.replace('\\x26', '\x26')
 733
 734                 video_url = mediaURL
 735
 736                 mobj = re.search(r'<title>(.*)</title>', webpage)
 737                 if mobj is None:
 738                         self._downloader.trouble(u'ERROR: unable to extract title')
 739                         return
 740                 video_title = mobj.group(1).decode('utf-8')
 741                 video_title = sanitize_title(video_title)
 742                 simple_title = simplify_title(video_title)
 743
 744                 # Extract video description
 745                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 746                 if mobj is None:
 747                         self._downloader.trouble(u'ERROR: unable to extract video description')
 748                         return
 749                 video_description = mobj.group(1).decode('utf-8')
 750                 if not video_description:
 751                         video_description = 'No description available.'
 752
 753                 # Extract video thumbnail
 754                 if self._downloader.params.get('forcethumbnail', False):
 755                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 756                         try:
 757                                 webpage = urllib2.urlopen(request).read()
 758                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 759                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 760                                 return
 761                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 762                         if mobj is None:
 763                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 764                                 return
 765                         video_thumbnail = mobj.group(1)
 766                 else:   # we need something to pass to process_info
 767                         video_thumbnail = ''
 768
 769                 return [{
 770                         'id':           video_id.decode('utf-8'),
 771                         'url':          video_url.decode('utf-8'),
 772                         'uploader':     u'NA',
 773                         'upload_date':  u'NA',
 774                         'title':        video_title,
 775                         'stitle':       simple_title,
 776                         'ext':          video_extension.decode('utf-8'),
 777                         'format':       u'NA',
 778                         'player_url':   None,
 779                 }]
 780
 781
 782 class PhotobucketIE(InfoExtractor):
 783         """Information extractor for photobucket.com."""
 784
 785         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 786         IE_NAME = u'photobucket'
 787
 788         def __init__(self, downloader=None):
 789                 InfoExtractor.__init__(self, downloader)
 790
 791         def report_download_webpage(self, video_id):
 792                 """Report webpage download."""
 793                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 794
 795         def report_extraction(self, video_id):
 796                 """Report information extraction."""
 797                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 798
 799         def _real_extract(self, url):
 800                 # Extract id from URL
 801                 mobj = re.match(self._VALID_URL, url)
 802                 if mobj is None:
 803                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 804                         return
 805
 806                 video_id = mobj.group(1)
 807
 808                 video_extension = 'flv'
 809
 810                 # Retrieve video webpage to extract further information
 811                 request = urllib2.Request(url)
 812                 try:
 813                         self.report_download_webpage(video_id)
 814                         webpage = urllib2.urlopen(request).read()
 815                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 816                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 817                         return
 818
 819                 # Extract URL, uploader, and title from webpage
 820                 self.report_extraction(video_id)
 821                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 822                 if mobj is None:
 823                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 824                         return
 825                 mediaURL = urllib.unquote(mobj.group(1))
 826
 827                 video_url = mediaURL
 828
 829                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 830                 if mobj is None:
 831                         self._downloader.trouble(u'ERROR: unable to extract title')
 832                         return
 833                 video_title = mobj.group(1).decode('utf-8')
 834                 video_title = sanitize_title(video_title)
 835                 simple_title = simplify_title(video_title)
 836
 837                 video_uploader = mobj.group(2).decode('utf-8')
 838
 839                 return [{
 840                         'id':           video_id.decode('utf-8'),
 841                         'url':          video_url.decode('utf-8'),
 842                         'uploader':     video_uploader,
 843                         'upload_date':  u'NA',
 844                         'title':        video_title,
 845                         'stitle':       simple_title,
 846                         'ext':          video_extension.decode('utf-8'),
 847                         'format':       u'NA',
 848                         'player_url':   None,
 849                 }]
 850
 851
 852 class YahooIE(InfoExtractor):
 853         """Information extractor for video.yahoo.com."""
 854
 855         # _VALID_URL matches all Yahoo! Video URLs
 856         # _VPAGE_URL matches only the extractable '/watch/' URLs
 857         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 858         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 859         IE_NAME = u'video.yahoo'
 860
 861         def __init__(self, downloader=None):
 862                 InfoExtractor.__init__(self, downloader)
 863
 864         def report_download_webpage(self, video_id):
 865                 """Report webpage download."""
 866                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 867
 868         def report_extraction(self, video_id):
 869                 """Report information extraction."""
 870                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 871
 872         def _real_extract(self, url, new_video=True):
 873                 # Extract ID from URL
 874                 mobj = re.match(self._VALID_URL, url)
 875                 if mobj is None:
 876                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 877                         return
 878
 879                 video_id = mobj.group(2)
 880                 video_extension = 'flv'
 881
 882                 # Rewrite valid but non-extractable URLs as
 883                 # extractable English language /watch/ URLs
 884                 if re.match(self._VPAGE_URL, url) is None:
 885                         request = urllib2.Request(url)
 886                         try:
 887                                 webpage = urllib2.urlopen(request).read()
 888                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 889                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 890                                 return
 891
 892                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 893                         if mobj is None:
 894                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 895                                 return
 896                         yahoo_id = mobj.group(1)
 897
 898                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 899                         if mobj is None:
 900                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 901                                 return
 902                         yahoo_vid = mobj.group(1)
 903
 904                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 905                         return self._real_extract(url, new_video=False)
 906
 907                 # Retrieve video webpage to extract further information
 908                 request = urllib2.Request(url)
 909                 try:
 910                         self.report_download_webpage(video_id)
 911                         webpage = urllib2.urlopen(request).read()
 912                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 913                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 914                         return
 915
 916                 # Extract uploader and title from webpage
 917                 self.report_extraction(video_id)
 918                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 919                 if mobj is None:
 920                         self._downloader.trouble(u'ERROR: unable to extract video title')
 921                         return
 922                 video_title = mobj.group(1).decode('utf-8')
 923                 simple_title = simplify_title(video_title)
 924
 925                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 926                 if mobj is None:
 927                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 928                         return
 929                 video_uploader = mobj.group(1).decode('utf-8')
 930
 931                 # Extract video thumbnail
 932                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 933                 if mobj is None:
 934                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 935                         return
 936                 video_thumbnail = mobj.group(1).decode('utf-8')
 937
 938                 # Extract video description
 939                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 940                 if mobj is None:
 941                         self._downloader.trouble(u'ERROR: unable to extract video description')
 942                         return
 943                 video_description = mobj.group(1).decode('utf-8')
 944                 if not video_description:
 945                         video_description = 'No description available.'
 946
 947                 # Extract video height and width
 948                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 949                 if mobj is None:
 950                         self._downloader.trouble(u'ERROR: unable to extract video height')
 951                         return
 952                 yv_video_height = mobj.group(1)
 953
 954                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 955                 if mobj is None:
 956                         self._downloader.trouble(u'ERROR: unable to extract video width')
 957                         return
 958                 yv_video_width = mobj.group(1)
 959
 960                 # Retrieve video playlist to extract media URL
 961                 # I'm not completely sure what all these options are, but we
 962                 # seem to need most of them, otherwise the server sends a 401.
 963                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 964                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 965                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 966                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 967                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 968                 try:
 969                         self.report_download_webpage(video_id)
 970                         webpage = urllib2.urlopen(request).read()
 971                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 972                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 973                         return
 974
 975                 # Extract media URL from playlist XML
 976                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 977                 if mobj is None:
 978                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
 979                         return
 980                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 981                 video_url = unescapeHTML(video_url)
 982
 983                 return [{
 984                         'id':           video_id.decode('utf-8'),
 985                         'url':          video_url,
 986                         'uploader':     video_uploader,
 987                         'upload_date':  u'NA',
 988                         'title':        video_title,
 989                         'stitle':       simple_title,
 990                         'ext':          video_extension.decode('utf-8'),
 991                         'thumbnail':    video_thumbnail.decode('utf-8'),
 992                         'description':  video_description,
 993                         'thumbnail':    video_thumbnail,
 994                         'player_url':   None,
 995                 }]
 996
 997
 998 class VimeoIE(InfoExtractor):
 999         """Information extractor for vimeo.com."""
1000
1001         # _VALID_URL matches Vimeo URLs
1002         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1003         IE_NAME = u'vimeo'
1004
1005         def __init__(self, downloader=None):
1006                 InfoExtractor.__init__(self, downloader)
1007
1008         def report_download_webpage(self, video_id):
1009                 """Report webpage download."""
1010                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1011
1012         def report_extraction(self, video_id):
1013                 """Report information extraction."""
1014                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1015
1016         def _real_extract(self, url, new_video=True):
1017                 # Extract ID from URL
1018                 mobj = re.match(self._VALID_URL, url)
1019                 if mobj is None:
1020                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1021                         return
1022
1023                 video_id = mobj.group(1)
1024
1025                 # Retrieve video webpage to extract further information
1026                 request = urllib2.Request(url, None, std_headers)
1027                 try:
1028                         self.report_download_webpage(video_id)
1029                         webpage = urllib2.urlopen(request).read()
1030                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1031                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1032                         return
1033
1034                 # Now we begin extracting as much information as we can from what we
1035                 # retrieved. First we extract the information common to all extractors,
1036                 # and latter we extract those that are Vimeo specific.
1037                 self.report_extraction(video_id)
1038
1039                 # Extract the config JSON
1040                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1041                 try:
1042                         config = json.loads(config)
1043                 except:
1044                         self._downloader.trouble(u'ERROR: unable to extract info section')
1045                         return
1046
1047                 # Extract title
1048                 video_title = config["video"]["title"]
1049                 simple_title = simplify_title(video_title)
1050
1051                 # Extract uploader
1052                 video_uploader = config["video"]["owner"]["name"]
1053
1054                 # Extract video thumbnail
1055                 video_thumbnail = config["video"]["thumbnail"]
1056
1057                 # Extract video description
1058                 video_description = get_element_by_id("description", webpage)
1059                 if video_description: video_description = clean_html(video_description.decode('utf8'))
1060                 else: video_description = ''
1061
1062                 # Extract upload date
1063                 video_upload_date = u'NA'
1064                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1065                 if mobj is not None:
1066                         video_upload_date = mobj.group(1)
1067
1068                 # Vimeo specific: extract request signature and timestamp
1069                 sig = config['request']['signature']
1070                 timestamp = config['request']['timestamp']
1071
1072                 # Vimeo specific: extract video codec and quality information
1073                 # TODO bind to format param
1074                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1075                 for codec in codecs:
1076                         if codec[0] in config["video"]["files"]:
1077                                 video_codec = codec[0]
1078                                 video_extension = codec[1]
1079                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1080                                 else: quality = 'sd'
1081                                 break
1082                 else:
1083                         self._downloader.trouble(u'ERROR: no known codec found')
1084                         return
1085
1086                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1087                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1088
1089                 return [{
1090                         'id':           video_id,
1091                         'url':          video_url,
1092                         'uploader':     video_uploader,
1093                         'upload_date':  video_upload_date,
1094                         'title':        video_title,
1095                         'stitle':       simple_title,
1096                         'ext':          video_extension,
1097                         'thumbnail':    video_thumbnail,
1098                         'description':  video_description,
1099                         'player_url':   None,
1100                 }]
1101
1102
1103 class GenericIE(InfoExtractor):
1104         """Generic last-resort information extractor."""
1105
1106         _VALID_URL = r'.*'
1107         IE_NAME = u'generic'
1108
1109         def __init__(self, downloader=None):
1110                 InfoExtractor.__init__(self, downloader)
1111
1112         def report_download_webpage(self, video_id):
1113                 """Report webpage download."""
1114                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1115                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1116
1117         def report_extraction(self, video_id):
1118                 """Report information extraction."""
1119                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1120
1121         def report_following_redirect(self, new_url):
1122                 """Report information extraction."""
1123                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1124
1125         def _test_redirect(self, url):
1126                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1127                 class HeadRequest(urllib2.Request):
1128                         def get_method(self):
1129                                 return "HEAD"
1130
1131                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1132                         """
1133                         Subclass the HTTPRedirectHandler to make it use our
1134                         HeadRequest also on the redirected URL
1135                         """
1136                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1137                                 if code in (301, 302, 303, 307):
1138                                         newurl = newurl.replace(' ', '%20')
1139                                         newheaders = dict((k,v) for k,v in req.headers.items()
1140                                                                           if k.lower() not in ("content-length", "content-type"))
1141                                         return HeadRequest(newurl,
1142                                                                            headers=newheaders,
1143                                                                            origin_req_host=req.get_origin_req_host(),
1144                                                                            unverifiable=True)
1145                                 else:
1146                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1147
1148                 class HTTPMethodFallback(urllib2.BaseHandler):
1149                         """
1150                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1151                         """
1152                         def http_error_405(self, req, fp, code, msg, headers):
1153                                 fp.read()
1154                                 fp.close()
1155
1156                                 newheaders = dict((k,v) for k,v in req.headers.items()
1157                                                                   if k.lower() not in ("content-length", "content-type"))
1158                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1159                                                                                                  headers=newheaders,
1160                                                                                                  origin_req_host=req.get_origin_req_host(),
1161                                                                                                  unverifiable=True))
1162
1163                 # Build our opener
1164                 opener = urllib2.OpenerDirector()
1165                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1166                                                 HTTPMethodFallback, HEADRedirectHandler,
1167                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1168                         opener.add_handler(handler())
1169
1170                 response = opener.open(HeadRequest(url))
1171                 new_url = response.geturl()
1172
1173                 if url == new_url: return False
1174
1175                 self.report_following_redirect(new_url)
1176                 self._downloader.download([new_url])
1177                 return True
1178
1179         def _real_extract(self, url):
1180                 if self._test_redirect(url): return
1181
1182                 video_id = url.split('/')[-1]
1183                 request = urllib2.Request(url)
1184                 try:
1185                         self.report_download_webpage(video_id)
1186                         webpage = urllib2.urlopen(request).read()
1187                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1188                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1189                         return
1190                 except ValueError, err:
1191                         # since this is the last-resort InfoExtractor, if
1192                         # this error is thrown, it'll be thrown here
1193                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1194                         return
1195
1196                 self.report_extraction(video_id)
1197                 # Start with something easy: JW Player in SWFObject
1198                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1199                 if mobj is None:
1200                         # Broaden the search a little bit
1201                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1204                         return
1205
1206                 # It's possible that one of the regexes
1207                 # matched, but returned an empty group:
1208                 if mobj.group(1) is None:
1209                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1210                         return
1211
1212                 video_url = urllib.unquote(mobj.group(1))
1213                 video_id = os.path.basename(video_url)
1214
1215                 # here's a fun little line of code for you:
1216                 video_extension = os.path.splitext(video_id)[1][1:]
1217                 video_id = os.path.splitext(video_id)[0]
1218
1219                 # it's tempting to parse this further, but you would
1220                 # have to take into account all the variations like
1221                 #   Video Title - Site Name
1222                 #   Site Name | Video Title
1223                 #   Video Title - Tagline | Site Name
1224                 # and so on and so forth; it's just not practical
1225                 mobj = re.search(r'<title>(.*)</title>', webpage)
1226                 if mobj is None:
1227                         self._downloader.trouble(u'ERROR: unable to extract title')
1228                         return
1229                 video_title = mobj.group(1).decode('utf-8')
1230                 video_title = sanitize_title(video_title)
1231                 simple_title = simplify_title(video_title)
1232
1233                 # video uploader is domain name
1234                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1235                 if mobj is None:
1236                         self._downloader.trouble(u'ERROR: unable to extract title')
1237                         return
1238                 video_uploader = mobj.group(1).decode('utf-8')
1239
1240                 return [{
1241                         'id':           video_id.decode('utf-8'),
1242                         'url':          video_url.decode('utf-8'),
1243                         'uploader':     video_uploader,
1244                         'upload_date':  u'NA',
1245                         'title':        video_title,
1246                         'stitle':       simple_title,
1247                         'ext':          video_extension.decode('utf-8'),
1248                         'format':       u'NA',
1249                         'player_url':   None,
1250                 }]
1251
1252
1253 class YoutubeSearchIE(InfoExtractor):
1254         """Information Extractor for YouTube search queries."""
1255         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1256         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1257         _max_youtube_results = 1000
1258         IE_NAME = u'youtube:search'
1259
1260         def __init__(self, downloader=None):
1261                 InfoExtractor.__init__(self, downloader)
1262
1263         def report_download_page(self, query, pagenum):
1264                 """Report attempt to download playlist page with given number."""
1265                 query = query.decode(preferredencoding())
1266                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1267
1268         def _real_extract(self, query):
1269                 mobj = re.match(self._VALID_URL, query)
1270                 if mobj is None:
1271                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1272                         return
1273
1274                 prefix, query = query.split(':')
1275                 prefix = prefix[8:]
1276                 query = query.encode('utf-8')
1277                 if prefix == '':
1278                         self._download_n_results(query, 1)
1279                         return
1280                 elif prefix == 'all':
1281                         self._download_n_results(query, self._max_youtube_results)
1282                         return
1283                 else:
1284                         try:
1285                                 n = long(prefix)
1286                                 if n <= 0:
1287                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1288                                         return
1289                                 elif n > self._max_youtube_results:
1290                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1291                                         n = self._max_youtube_results
1292                                 self._download_n_results(query, n)
1293                                 return
1294                         except ValueError: # parsing prefix as integer fails
1295                                 self._download_n_results(query, 1)
1296                                 return
1297
1298         def _download_n_results(self, query, n):
1299                 """Downloads a specified number of results for a query"""
1300
1301                 video_ids = []
1302                 pagenum = 0
1303                 limit = n
1304
1305                 while (50 * pagenum) < limit:
1306                         self.report_download_page(query, pagenum+1)
1307                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1308                         request = urllib2.Request(result_url)
1309                         try:
1310                                 data = urllib2.urlopen(request).read()
1311                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1312                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1313                                 return
1314                         api_response = json.loads(data)['data']
1315
1316                         new_ids = list(video['id'] for video in api_response['items'])
1317                         video_ids += new_ids
1318
1319                         limit = min(n, api_response['totalItems'])
1320                         pagenum += 1
1321
1322                 if len(video_ids) > n:
1323                         video_ids = video_ids[:n]
1324                 for id in video_ids:
1325                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1326                 return
1327
1328
1329 class GoogleSearchIE(InfoExtractor):
1330         """Information Extractor for Google Video search queries."""
1331         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1332         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1333         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1334         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1335         _max_google_results = 1000
1336         IE_NAME = u'video.google:search'
1337
1338         def __init__(self, downloader=None):
1339                 InfoExtractor.__init__(self, downloader)
1340
1341         def report_download_page(self, query, pagenum):
1342                 """Report attempt to download playlist page with given number."""
1343                 query = query.decode(preferredencoding())
1344                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1345
1346         def _real_extract(self, query):
1347                 mobj = re.match(self._VALID_URL, query)
1348                 if mobj is None:
1349                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1350                         return
1351
1352                 prefix, query = query.split(':')
1353                 prefix = prefix[8:]
1354                 query = query.encode('utf-8')
1355                 if prefix == '':
1356                         self._download_n_results(query, 1)
1357                         return
1358                 elif prefix == 'all':
1359                         self._download_n_results(query, self._max_google_results)
1360                         return
1361                 else:
1362                         try:
1363                                 n = long(prefix)
1364                                 if n <= 0:
1365                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1366                                         return
1367                                 elif n > self._max_google_results:
1368                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1369                                         n = self._max_google_results
1370                                 self._download_n_results(query, n)
1371                                 return
1372                         except ValueError: # parsing prefix as integer fails
1373                                 self._download_n_results(query, 1)
1374                                 return
1375
1376         def _download_n_results(self, query, n):
1377                 """Downloads a specified number of results for a query"""
1378
1379                 video_ids = []
1380                 pagenum = 0
1381
1382                 while True:
1383                         self.report_download_page(query, pagenum)
1384                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1385                         request = urllib2.Request(result_url)
1386                         try:
1387                                 page = urllib2.urlopen(request).read()
1388                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1389                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1390                                 return
1391
1392                         # Extract video identifiers
1393                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1394                                 video_id = mobj.group(1)
1395                                 if video_id not in video_ids:
1396                                         video_ids.append(video_id)
1397                                         if len(video_ids) == n:
1398                                                 # Specified n videos reached
1399                                                 for id in video_ids:
1400                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1401                                                 return
1402
1403                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1404                                 for id in video_ids:
1405                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1406                                 return
1407
1408                         pagenum = pagenum + 1
1409
1410
1411 class YahooSearchIE(InfoExtractor):
1412         """Information Extractor for Yahoo! Video search queries."""
1413         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1414         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1415         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1416         _MORE_PAGES_INDICATOR = r'\s*Next'
1417         _max_yahoo_results = 1000
1418         IE_NAME = u'video.yahoo:search'
1419
1420         def __init__(self, downloader=None):
1421                 InfoExtractor.__init__(self, downloader)
1422
1423         def report_download_page(self, query, pagenum):
1424                 """Report attempt to download playlist page with given number."""
1425                 query = query.decode(preferredencoding())
1426                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1427
1428         def _real_extract(self, query):
1429                 mobj = re.match(self._VALID_URL, query)
1430                 if mobj is None:
1431                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1432                         return
1433
1434                 prefix, query = query.split(':')
1435                 prefix = prefix[8:]
1436                 query = query.encode('utf-8')
1437                 if prefix == '':
1438                         self._download_n_results(query, 1)
1439                         return
1440                 elif prefix == 'all':
1441                         self._download_n_results(query, self._max_yahoo_results)
1442                         return
1443                 else:
1444                         try:
1445                                 n = long(prefix)
1446                                 if n <= 0:
1447                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1448                                         return
1449                                 elif n > self._max_yahoo_results:
1450                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1451                                         n = self._max_yahoo_results
1452                                 self._download_n_results(query, n)
1453                                 return
1454                         except ValueError: # parsing prefix as integer fails
1455                                 self._download_n_results(query, 1)
1456                                 return
1457
1458         def _download_n_results(self, query, n):
1459                 """Downloads a specified number of results for a query"""
1460
1461                 video_ids = []
1462                 already_seen = set()
1463                 pagenum = 1
1464
1465                 while True:
1466                         self.report_download_page(query, pagenum)
1467                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1468                         request = urllib2.Request(result_url)
1469                         try:
1470                                 page = urllib2.urlopen(request).read()
1471                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1473                                 return
1474
1475                         # Extract video identifiers
1476                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1477                                 video_id = mobj.group(1)
1478                                 if video_id not in already_seen:
1479                                         video_ids.append(video_id)
1480                                         already_seen.add(video_id)
1481                                         if len(video_ids) == n:
1482                                                 # Specified n videos reached
1483                                                 for id in video_ids:
1484                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1485                                                 return
1486
1487                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1488                                 for id in video_ids:
1489                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1490                                 return
1491
1492                         pagenum = pagenum + 1
1493
1494
1495 class YoutubePlaylistIE(InfoExtractor):
1496         """Information Extractor for YouTube playlists."""
1497
1498         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1499         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1500         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
1501         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1502         IE_NAME = u'youtube:playlist'
1503
1504         def __init__(self, downloader=None):
1505                 InfoExtractor.__init__(self, downloader)
1506
1507         def report_download_page(self, playlist_id, pagenum):
1508                 """Report attempt to download playlist page with given number."""
1509                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1510
1511         def _real_extract(self, url):
1512                 # Extract playlist id
1513                 mobj = re.match(self._VALID_URL, url)
1514                 if mobj is None:
1515                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1516                         return
1517
1518                 # Single video case
1519                 if mobj.group(3) is not None:
1520                         self._downloader.download([mobj.group(3)])
1521                         return
1522
1523                 # Download playlist pages
1524                 # prefix is 'p' as default for playlists but there are other types that need extra care
1525                 playlist_prefix = mobj.group(1)
1526                 if playlist_prefix == 'a':
1527                         playlist_access = 'artist'
1528                 else:
1529                         playlist_prefix = 'p'
1530                         playlist_access = 'view_play_list'
1531                 playlist_id = mobj.group(2)
1532                 video_ids = []
1533                 pagenum = 1
1534
1535                 while True:
1536                         self.report_download_page(playlist_id, pagenum)
1537                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1538                         request = urllib2.Request(url)
1539                         try:
1540                                 page = urllib2.urlopen(request).read()
1541                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1542                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1543                                 return
1544
1545                         # Extract video identifiers
1546                         ids_in_page = []
1547                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1548                                 if mobj.group(1) not in ids_in_page:
1549                                         ids_in_page.append(mobj.group(1))
1550                         video_ids.extend(ids_in_page)
1551
1552                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1553                                 break
1554                         pagenum = pagenum + 1
1555
1556                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1557                 playlistend = self._downloader.params.get('playlistend', -1)
1558                 if playlistend == -1:
1559                         video_ids = video_ids[playliststart:]
1560                 else:
1561                         video_ids = video_ids[playliststart:playlistend]
1562
1563                 for id in video_ids:
1564                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1565                 return
1566
1567
1568 class YoutubeUserIE(InfoExtractor):
1569         """Information Extractor for YouTube users."""
1570
1571         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1572         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1573         _GDATA_PAGE_SIZE = 50
1574         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1575         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1576         IE_NAME = u'youtube:user'
1577
1578         def __init__(self, downloader=None):
1579                 InfoExtractor.__init__(self, downloader)
1580
1581         def report_download_page(self, username, start_index):
1582                 """Report attempt to download user page."""
1583                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1584                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1585
1586         def _real_extract(self, url):
1587                 # Extract username
1588                 mobj = re.match(self._VALID_URL, url)
1589                 if mobj is None:
1590                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1591                         return
1592
1593                 username = mobj.group(1)
1594
1595                 # Download video ids using YouTube Data API. Result size per
1596                 # query is limited (currently to 50 videos) so we need to query
1597                 # page by page until there are no video ids - it means we got
1598                 # all of them.
1599
1600                 video_ids = []
1601                 pagenum = 0
1602
1603                 while True:
1604                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1605                         self.report_download_page(username, start_index)
1606
1607                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1608
1609                         try:
1610                                 page = urllib2.urlopen(request).read()
1611                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1613                                 return
1614
1615                         # Extract video identifiers
1616                         ids_in_page = []
1617
1618                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1619                                 if mobj.group(1) not in ids_in_page:
1620                                         ids_in_page.append(mobj.group(1))
1621
1622                         video_ids.extend(ids_in_page)
1623
1624                         # A little optimization - if current page is not
1625                         # "full", ie. does not contain PAGE_SIZE video ids then
1626                         # we can assume that this page is the last one - there
1627                         # are no more ids on further pages - no need to query
1628                         # again.
1629
1630                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1631                                 break
1632
1633                         pagenum += 1
1634
1635                 all_ids_count = len(video_ids)
1636                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1637                 playlistend = self._downloader.params.get('playlistend', -1)
1638
1639                 if playlistend == -1:
1640                         video_ids = video_ids[playliststart:]
1641                 else:
1642                         video_ids = video_ids[playliststart:playlistend]
1643
1644                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1645                                 (username, all_ids_count, len(video_ids)))
1646
1647                 for video_id in video_ids:
1648                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1649
1650
1651 class DepositFilesIE(InfoExtractor):
1652         """Information extractor for depositfiles.com"""
1653
1654         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1655         IE_NAME = u'DepositFiles'
1656
1657         def __init__(self, downloader=None):
1658                 InfoExtractor.__init__(self, downloader)
1659
1660         def report_download_webpage(self, file_id):
1661                 """Report webpage download."""
1662                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1663
1664         def report_extraction(self, file_id):
1665                 """Report information extraction."""
1666                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1667
1668         def _real_extract(self, url):
1669                 file_id = url.split('/')[-1]
1670                 # Rebuild url in english locale
1671                 url = 'http://depositfiles.com/en/files/' + file_id
1672
1673                 # Retrieve file webpage with 'Free download' button pressed
1674                 free_download_indication = { 'gateway_result' : '1' }
1675                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1676                 try:
1677                         self.report_download_webpage(file_id)
1678                         webpage = urllib2.urlopen(request).read()
1679                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1681                         return
1682
1683                 # Search for the real file URL
1684                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1685                 if (mobj is None) or (mobj.group(1) is None):
1686                         # Try to figure out reason of the error.
1687                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1688                         if (mobj is not None) and (mobj.group(1) is not None):
1689                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1690                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1691                         else:
1692                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1693                         return
1694
1695                 file_url = mobj.group(1)
1696                 file_extension = os.path.splitext(file_url)[1][1:]
1697
1698                 # Search for file title
1699                 mobj = re.search(r'<b title="(.*?)">', webpage)
1700                 if mobj is None:
1701                         self._downloader.trouble(u'ERROR: unable to extract title')
1702                         return
1703                 file_title = mobj.group(1).decode('utf-8')
1704
1705                 return [{
1706                         'id':           file_id.decode('utf-8'),
1707                         'url':          file_url.decode('utf-8'),
1708                         'uploader':     u'NA',
1709                         'upload_date':  u'NA',
1710                         'title':        file_title,
1711                         'stitle':       file_title,
1712                         'ext':          file_extension.decode('utf-8'),
1713                         'format':       u'NA',
1714                         'player_url':   None,
1715                 }]
1716
1717
1718 class FacebookIE(InfoExtractor):
1719         """Information Extractor for Facebook"""
1720
1721         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1722         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1723         _NETRC_MACHINE = 'facebook'
1724         _available_formats = ['video', 'highqual', 'lowqual']
1725         _video_extensions = {
1726                 'video': 'mp4',
1727                 'highqual': 'mp4',
1728                 'lowqual': 'mp4',
1729         }
1730         IE_NAME = u'facebook'
1731
1732         def __init__(self, downloader=None):
1733                 InfoExtractor.__init__(self, downloader)
1734
1735         def _reporter(self, message):
1736                 """Add header and report message."""
1737                 self._downloader.to_screen(u'[facebook] %s' % message)
1738
1739         def report_login(self):
1740                 """Report attempt to log in."""
1741                 self._reporter(u'Logging in')
1742
1743         def report_video_webpage_download(self, video_id):
1744                 """Report attempt to download video webpage."""
1745                 self._reporter(u'%s: Downloading video webpage' % video_id)
1746
1747         def report_information_extraction(self, video_id):
1748                 """Report attempt to extract video information."""
1749                 self._reporter(u'%s: Extracting video information' % video_id)
1750
1751         def _parse_page(self, video_webpage):
1752                 """Extract video information from page"""
1753                 # General data
1754                 data = {'title': r'\("video_title", "(.*?)"\)',
1755                         'description': r'<div class="datawrap">(.*?)</div>',
1756                         'owner': r'\("video_owner_name", "(.*?)"\)',
1757                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1758                         }
1759                 video_info = {}
1760                 for piece in data.keys():
1761                         mobj = re.search(data[piece], video_webpage)
1762                         if mobj is not None:
1763                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1764
1765                 # Video urls
1766                 video_urls = {}
1767                 for fmt in self._available_formats:
1768                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1769                         if mobj is not None:
1770                                 # URL is in a Javascript segment inside an escaped Unicode format within
1771                                 # the generally utf-8 page
1772                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1773                 video_info['video_urls'] = video_urls
1774
1775                 return video_info
1776
1777         def _real_initialize(self):
1778                 if self._downloader is None:
1779                         return
1780
1781                 useremail = None
1782                 password = None
1783                 downloader_params = self._downloader.params
1784
1785                 # Attempt to use provided username and password or .netrc data
1786                 if downloader_params.get('username', None) is not None:
1787                         useremail = downloader_params['username']
1788                         password = downloader_params['password']
1789                 elif downloader_params.get('usenetrc', False):
1790                         try:
1791                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1792                                 if info is not None:
1793                                         useremail = info[0]
1794                                         password = info[2]
1795                                 else:
1796                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1797                         except (IOError, netrc.NetrcParseError), err:
1798                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1799                                 return
1800
1801                 if useremail is None:
1802                         return
1803
1804                 # Log in
1805                 login_form = {
1806                         'email': useremail,
1807                         'pass': password,
1808                         'login': 'Log+In'
1809                         }
1810                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1811                 try:
1812                         self.report_login()
1813                         login_results = urllib2.urlopen(request).read()
1814                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1815                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1816                                 return
1817                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1818                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1819                         return
1820
1821         def _real_extract(self, url):
1822                 mobj = re.match(self._VALID_URL, url)
1823                 if mobj is None:
1824                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1825                         return
1826                 video_id = mobj.group('ID')
1827
1828                 # Get video webpage
1829                 self.report_video_webpage_download(video_id)
1830                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1831                 try:
1832                         page = urllib2.urlopen(request)
1833                         video_webpage = page.read()
1834                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1836                         return
1837
1838                 # Start extracting information
1839                 self.report_information_extraction(video_id)
1840
1841                 # Extract information
1842                 video_info = self._parse_page(video_webpage)
1843
1844                 # uploader
1845                 if 'owner' not in video_info:
1846                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1847                         return
1848                 video_uploader = video_info['owner']
1849
1850                 # title
1851                 if 'title' not in video_info:
1852                         self._downloader.trouble(u'ERROR: unable to extract video title')
1853                         return
1854                 video_title = video_info['title']
1855                 video_title = video_title.decode('utf-8')
1856                 video_title = sanitize_title(video_title)
1857
1858                 simple_title = simplify_title(video_title)
1859
1860                 # thumbnail image
1861                 if 'thumbnail' not in video_info:
1862                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1863                         video_thumbnail = ''
1864                 else:
1865                         video_thumbnail = video_info['thumbnail']
1866
1867                 # upload date
1868                 upload_date = u'NA'
1869                 if 'upload_date' in video_info:
1870                         upload_time = video_info['upload_date']
1871                         timetuple = email.utils.parsedate_tz(upload_time)
1872                         if timetuple is not None:
1873                                 try:
1874                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1875                                 except:
1876                                         pass
1877
1878                 # description
1879                 video_description = video_info.get('description', 'No description available.')
1880
1881                 url_map = video_info['video_urls']
1882                 if len(url_map.keys()) > 0:
1883                         # Decide which formats to download
1884                         req_format = self._downloader.params.get('format', None)
1885                         format_limit = self._downloader.params.get('format_limit', None)
1886
1887                         if format_limit is not None and format_limit in self._available_formats:
1888                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1889                         else:
1890                                 format_list = self._available_formats
1891                         existing_formats = [x for x in format_list if x in url_map]
1892                         if len(existing_formats) == 0:
1893                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1894                                 return
1895                         if req_format is None:
1896                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1897                         elif req_format == 'worst':
1898                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1899                         elif req_format == '-1':
1900                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1901                         else:
1902                                 # Specific format
1903                                 if req_format not in url_map:
1904                                         self._downloader.trouble(u'ERROR: requested format not available')
1905                                         return
1906                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1907
1908                 results = []
1909                 for format_param, video_real_url in video_url_list:
1910                         # Extension
1911                         video_extension = self._video_extensions.get(format_param, 'mp4')
1912
1913                         results.append({
1914                                 'id':           video_id.decode('utf-8'),
1915                                 'url':          video_real_url.decode('utf-8'),
1916                                 'uploader':     video_uploader.decode('utf-8'),
1917                                 'upload_date':  upload_date,
1918                                 'title':        video_title,
1919                                 'stitle':       simple_title,
1920                                 'ext':          video_extension.decode('utf-8'),
1921                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1922                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1923                                 'description':  video_description.decode('utf-8'),
1924                                 'player_url':   None,
1925                         })
1926                 return results
1927
1928 class BlipTVIE(InfoExtractor):
1929         """Information extractor for blip.tv"""
1930
1931         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1932         _URL_EXT = r'^.*\.([a-z0-9]+)$'
1933         IE_NAME = u'blip.tv'
1934
1935         def report_extraction(self, file_id):
1936                 """Report information extraction."""
1937                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1938
1939         def report_direct_download(self, title):
1940                 """Report information extraction."""
1941                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1942
1943         def _real_extract(self, url):
1944                 mobj = re.match(self._VALID_URL, url)
1945                 if mobj is None:
1946                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1947                         return
1948
1949                 if '?' in url:
1950                         cchar = '&'
1951                 else:
1952                         cchar = '?'
1953                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1954                 request = urllib2.Request(json_url)
1955                 self.report_extraction(mobj.group(1))
1956                 info = None
1957                 try:
1958                         urlh = urllib2.urlopen(request)
1959                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1960                                 basename = url.split('/')[-1]
1961                                 title,ext = os.path.splitext(basename)
1962                                 title = title.decode('UTF-8')
1963                                 ext = ext.replace('.', '')
1964                                 self.report_direct_download(title)
1965                                 info = {
1966                                         'id': title,
1967                                         'url': url,
1968                                         'title': title,
1969                                         'stitle': simplify_title(title),
1970                                         'ext': ext,
1971                                         'urlhandle': urlh
1972                                 }
1973                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1974                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1975                         return
1976                 if info is None: # Regular URL
1977                         try:
1978                                 json_code = urlh.read()
1979                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1980                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1981                                 return
1982
1983                         try:
1984                                 json_data = json.loads(json_code)
1985                                 if 'Post' in json_data:
1986                                         data = json_data['Post']
1987                                 else:
1988                                         data = json_data
1989
1990                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1991                                 video_url = data['media']['url']
1992                                 umobj = re.match(self._URL_EXT, video_url)
1993                                 if umobj is None:
1994                                         raise ValueError('Can not determine filename extension')
1995                                 ext = umobj.group(1)
1996
1997                                 info = {
1998                                         'id': data['item_id'],
1999                                         'url': video_url,
2000                                         'uploader': data['display_name'],
2001                                         'upload_date': upload_date,
2002                                         'title': data['title'],
2003                                         'stitle': simplify_title(data['title']),
2004                                         'ext': ext,
2005                                         'format': data['media']['mimeType'],
2006                                         'thumbnail': data['thumbnailUrl'],
2007                                         'description': data['description'],
2008                                         'player_url': data['embedUrl']
2009                                 }
2010                         except (ValueError,KeyError), err:
2011                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2012                                 return
2013
2014                 return [info]
2015
2016
2017 class MyVideoIE(InfoExtractor):
2018         """Information Extractor for myvideo.de."""
2019
2020         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2021         IE_NAME = u'myvideo'
2022
2023         def __init__(self, downloader=None):
2024                 InfoExtractor.__init__(self, downloader)
2025
2026         def report_download_webpage(self, video_id):
2027                 """Report webpage download."""
2028                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2029
2030         def report_extraction(self, video_id):
2031                 """Report information extraction."""
2032                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2033
2034         def _real_extract(self,url):
2035                 mobj = re.match(self._VALID_URL, url)
2036                 if mobj is None:
2037                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2038                         return
2039
2040                 video_id = mobj.group(1)
2041
2042                 # Get video webpage
2043                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2044                 try:
2045                         self.report_download_webpage(video_id)
2046                         webpage = urllib2.urlopen(request).read()
2047                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2048                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2049                         return
2050
2051                 self.report_extraction(video_id)
2052                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2053                                  webpage)
2054                 if mobj is None:
2055                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2056                         return
2057                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2058
2059                 mobj = re.search('<title>([^<]+)</title>', webpage)
2060                 if mobj is None:
2061                         self._downloader.trouble(u'ERROR: unable to extract title')
2062                         return
2063
2064                 video_title = mobj.group(1)
2065                 video_title = sanitize_title(video_title)
2066
2067                 simple_title = simplify_title(video_title)
2068
2069                 return [{
2070                         'id':           video_id,
2071                         'url':          video_url,
2072                         'uploader':     u'NA',
2073                         'upload_date':  u'NA',
2074                         'title':        video_title,
2075                         'stitle':       simple_title,
2076                         'ext':          u'flv',
2077                         'format':       u'NA',
2078                         'player_url':   None,
2079                 }]
2080
2081 class ComedyCentralIE(InfoExtractor):
2082         """Information extractor for The Daily Show and Colbert Report """
2083
2084         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2085         IE_NAME = u'comedycentral'
2086
2087         def report_extraction(self, episode_id):
2088                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2089
2090         def report_config_download(self, episode_id):
2091                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2092
2093         def report_index_download(self, episode_id):
2094                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2095
2096         def report_player_url(self, episode_id):
2097                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2098
2099         def _real_extract(self, url):
2100                 mobj = re.match(self._VALID_URL, url)
2101                 if mobj is None:
2102                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2103                         return
2104
2105                 if mobj.group('shortname'):
2106                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2107                                 url = u'http://www.thedailyshow.com/full-episodes/'
2108                         else:
2109                                 url = u'http://www.colbertnation.com/full-episodes/'
2110                         mobj = re.match(self._VALID_URL, url)
2111                         assert mobj is not None
2112
2113                 dlNewest = not mobj.group('episode')
2114                 if dlNewest:
2115                         epTitle = mobj.group('showname')
2116                 else:
2117                         epTitle = mobj.group('episode')
2118
2119                 req = urllib2.Request(url)
2120                 self.report_extraction(epTitle)
2121                 try:
2122                         htmlHandle = urllib2.urlopen(req)
2123                         html = htmlHandle.read()
2124                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2125                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2126                         return
2127                 if dlNewest:
2128                         url = htmlHandle.geturl()
2129                         mobj = re.match(self._VALID_URL, url)
2130                         if mobj is None:
2131                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2132                                 return
2133                         if mobj.group('episode') == '':
2134                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2135                                 return
2136                         epTitle = mobj.group('episode')
2137
2138                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2139                 if len(mMovieParams) == 0:
2140                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2141                         return
2142
2143                 playerUrl_raw = mMovieParams[0][0]
2144                 self.report_player_url(epTitle)
2145                 try:
2146                         urlHandle = urllib2.urlopen(playerUrl_raw)
2147                         playerUrl = urlHandle.geturl()
2148                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2149                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2150                         return
2151
2152                 uri = mMovieParams[0][1]
2153                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2154                 self.report_index_download(epTitle)
2155                 try:
2156                         indexXml = urllib2.urlopen(indexUrl).read()
2157                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2158                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2159                         return
2160
2161                 results = []
2162
2163                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2164                 itemEls = idoc.findall('.//item')
2165                 for itemEl in itemEls:
2166                         mediaId = itemEl.findall('./guid')[0].text
2167                         shortMediaId = mediaId.split(':')[-1]
2168                         showId = mediaId.split(':')[-2].replace('.com', '')
2169                         officialTitle = itemEl.findall('./title')[0].text
2170                         officialDate = itemEl.findall('./pubDate')[0].text
2171
2172                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2173                                                 urllib.urlencode({'uri': mediaId}))
2174                         configReq = urllib2.Request(configUrl)
2175                         self.report_config_download(epTitle)
2176                         try:
2177                                 configXml = urllib2.urlopen(configReq).read()
2178                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2179                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2180                                 return
2181
2182                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2183                         turls = []
2184                         for rendition in cdoc.findall('.//rendition'):
2185                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2186                                 turls.append(finfo)
2187
2188                         if len(turls) == 0:
2189                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2190                                 continue
2191
2192                         # For now, just pick the highest bitrate
2193                         format,video_url = turls[-1]
2194
2195                         effTitle = showId + u'-' + epTitle
2196                         info = {
2197                                 'id': shortMediaId,
2198                                 'url': video_url,
2199                                 'uploader': showId,
2200                                 'upload_date': officialDate,
2201                                 'title': effTitle,
2202                                 'stitle': simplify_title(effTitle),
2203                                 'ext': 'mp4',
2204                                 'format': format,
2205                                 'thumbnail': None,
2206                                 'description': officialTitle,
2207                                 'player_url': playerUrl
2208                         }
2209
2210                         results.append(info)
2211
2212                 return results
2213
2214
2215 class EscapistIE(InfoExtractor):
2216         """Information extractor for The Escapist """
2217
2218         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2219         IE_NAME = u'escapist'
2220
2221         def report_extraction(self, showName):
2222                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2223
2224         def report_config_download(self, showName):
2225                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2226
2227         def _real_extract(self, url):
2228                 mobj = re.match(self._VALID_URL, url)
2229                 if mobj is None:
2230                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2231                         return
2232                 showName = mobj.group('showname')
2233                 videoId = mobj.group('episode')
2234
2235                 self.report_extraction(showName)
2236                 try:
2237                         webPage = urllib2.urlopen(url).read()
2238                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2239                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2240                         return
2241
2242                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2243                 description = unescapeHTML(descMatch.group(1))
2244                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2245                 imgUrl = unescapeHTML(imgMatch.group(1))
2246                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2247                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2248                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2249                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2250
2251                 self.report_config_download(showName)
2252                 try:
2253                         configJSON = urllib2.urlopen(configUrl).read()
2254                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2256                         return
2257
2258                 # Technically, it's JavaScript, not JSON
2259                 configJSON = configJSON.replace("'", '"')
2260
2261                 try:
2262                         config = json.loads(configJSON)
2263                 except (ValueError,), err:
2264                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2265                         return
2266
2267                 playlist = config['playlist']
2268                 videoUrl = playlist[1]['url']
2269
2270                 info = {
2271                         'id': videoId,
2272                         'url': videoUrl,
2273                         'uploader': showName,
2274                         'upload_date': None,
2275                         'title': showName,
2276                         'stitle': simplify_title(showName),
2277                         'ext': 'flv',
2278                         'format': 'flv',
2279                         'thumbnail': imgUrl,
2280                         'description': description,
2281                         'player_url': playerUrl,
2282                 }
2283
2284                 return [info]
2285
2286
2287 class CollegeHumorIE(InfoExtractor):
2288         """Information extractor for collegehumor.com"""
2289
2290         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2291         IE_NAME = u'collegehumor'
2292
2293         def report_webpage(self, video_id):
2294                 """Report information extraction."""
2295                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2296
2297         def report_extraction(self, video_id):
2298                 """Report information extraction."""
2299                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2300
2301         def _real_extract(self, url):
2302                 mobj = re.match(self._VALID_URL, url)
2303                 if mobj is None:
2304                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2305                         return
2306                 video_id = mobj.group('videoid')
2307
2308                 self.report_webpage(video_id)
2309                 request = urllib2.Request(url)
2310                 try:
2311                         webpage = urllib2.urlopen(request).read()
2312                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2314                         return
2315
2316                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2317                 if m is None:
2318                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2319                         return
2320                 internal_video_id = m.group('internalvideoid')
2321
2322                 info = {
2323                         'id': video_id,
2324                         'internal_id': internal_video_id,
2325                 }
2326
2327                 self.report_extraction(video_id)
2328                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2329                 try:
2330                         metaXml = urllib2.urlopen(xmlUrl).read()
2331                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2332                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2333                         return
2334
2335                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2336                 try:
2337                         videoNode = mdoc.findall('./video')[0]
2338                         info['description'] = videoNode.findall('./description')[0].text
2339                         info['title'] = videoNode.findall('./caption')[0].text
2340                         info['stitle'] = simplify_title(info['title'])
2341                         info['url'] = videoNode.findall('./file')[0].text
2342                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2343                         info['ext'] = info['url'].rpartition('.')[2]
2344                         info['format'] = info['ext']
2345                 except IndexError:
2346                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2347                         return
2348
2349                 return [info]
2350
2351
2352 class XVideosIE(InfoExtractor):
2353         """Information extractor for xvideos.com"""
2354
2355         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2356         IE_NAME = u'xvideos'
2357
2358         def report_webpage(self, video_id):
2359                 """Report information extraction."""
2360                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2361
2362         def report_extraction(self, video_id):
2363                 """Report information extraction."""
2364                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2365
2366         def _real_extract(self, url):
2367                 mobj = re.match(self._VALID_URL, url)
2368                 if mobj is None:
2369                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2370                         return
2371                 video_id = mobj.group(1).decode('utf-8')
2372
2373                 self.report_webpage(video_id)
2374
2375                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2376                 try:
2377                         webpage = urllib2.urlopen(request).read()
2378                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2380                         return
2381
2382                 self.report_extraction(video_id)
2383
2384
2385                 # Extract video URL
2386                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2387                 if mobj is None:
2388                         self._downloader.trouble(u'ERROR: unable to extract video url')
2389                         return
2390                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2391
2392
2393                 # Extract title
2394                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2395                 if mobj is None:
2396                         self._downloader.trouble(u'ERROR: unable to extract video title')
2397                         return
2398                 video_title = mobj.group(1).decode('utf-8')
2399
2400
2401                 # Extract video thumbnail
2402                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2403                 if mobj is None:
2404                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2405                         return
2406                 video_thumbnail = mobj.group(1).decode('utf-8')
2407
2408                 info = {
2409                         'id': video_id,
2410                         'url': video_url,
2411                         'uploader': None,
2412                         'upload_date': None,
2413                         'title': video_title,
2414                         'stitle': simplify_title(video_title),
2415                         'ext': 'flv',
2416                         'format': 'flv',
2417                         'thumbnail': video_thumbnail,
2418                         'description': None,
2419                         'player_url': None,
2420                 }
2421
2422                 return [info]
2423
2424
2425 class SoundcloudIE(InfoExtractor):
2426         """Information extractor for soundcloud.com
2427            To access the media, the uid of the song and a stream token
2428            must be extracted from the page source and the script must make
2429            a request to media.soundcloud.com/crossdomain.xml. Then
2430            the media can be grabbed by requesting from an url composed
2431            of the stream token and uid
2432          """
2433
2434         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2435         IE_NAME = u'soundcloud'
2436
2437         def __init__(self, downloader=None):
2438                 InfoExtractor.__init__(self, downloader)
2439
2440         def report_webpage(self, video_id):
2441                 """Report information extraction."""
2442                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443
2444         def report_extraction(self, video_id):
2445                 """Report information extraction."""
2446                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447
2448         def _real_extract(self, url):
2449                 mobj = re.match(self._VALID_URL, url)
2450                 if mobj is None:
2451                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452                         return
2453
2454                 # extract uploader (which is in the url)
2455                 uploader = mobj.group(1).decode('utf-8')
2456                 # extract simple title (uploader + slug of song title)
2457                 slug_title =  mobj.group(2).decode('utf-8')
2458                 simple_title = uploader + '-' + slug_title
2459
2460                 self.report_webpage('%s/%s' % (uploader, slug_title))
2461
2462                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2463                 try:
2464                         webpage = urllib2.urlopen(request).read()
2465                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2467                         return
2468
2469                 self.report_extraction('%s/%s' % (uploader, slug_title))
2470
2471                 # extract uid and stream token that soundcloud hands out for access
2472                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2473                 if mobj:
2474                         video_id = mobj.group(1)
2475                         stream_token = mobj.group(2)
2476
2477                 # extract unsimplified title
2478                 mobj = re.search('"title":"(.*?)",', webpage)
2479                 if mobj:
2480                         title = mobj.group(1)
2481
2482                 # construct media url (with uid/token)
2483                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2484                 mediaURL = mediaURL % (video_id, stream_token)
2485
2486                 # description
2487                 description = u'No description available'
2488                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2489                 if mobj:
2490                         description = mobj.group(1)
2491
2492                 # upload date
2493                 upload_date = None
2494                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2495                 if mobj:
2496                         try:
2497                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2498                         except Exception, e:
2499                                 print str(e)
2500
2501                 # for soundcloud, a request to a cross domain is required for cookies
2502                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2503
2504                 return [{
2505                         'id':           video_id.decode('utf-8'),
2506                         'url':          mediaURL,
2507                         'uploader':     uploader.decode('utf-8'),
2508                         'upload_date':  upload_date,
2509                         'title':        simple_title.decode('utf-8'),
2510                         'stitle':       simple_title.decode('utf-8'),
2511                         'ext':          u'mp3',
2512                         'format':       u'NA',
2513                         'player_url':   None,
2514                         'description': description.decode('utf-8')
2515                 }]
2516
2517
2518 class InfoQIE(InfoExtractor):
2519         """Information extractor for infoq.com"""
2520
2521         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2522         IE_NAME = u'infoq'
2523
2524         def report_webpage(self, video_id):
2525                 """Report information extraction."""
2526                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2527
2528         def report_extraction(self, video_id):
2529                 """Report information extraction."""
2530                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2531
2532         def _real_extract(self, url):
2533                 mobj = re.match(self._VALID_URL, url)
2534                 if mobj is None:
2535                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2536                         return
2537
2538                 self.report_webpage(url)
2539
2540                 request = urllib2.Request(url)
2541                 try:
2542                         webpage = urllib2.urlopen(request).read()
2543                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2544                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2545                         return
2546
2547                 self.report_extraction(url)
2548
2549
2550                 # Extract video URL
2551                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2552                 if mobj is None:
2553                         self._downloader.trouble(u'ERROR: unable to extract video url')
2554                         return
2555                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2556
2557
2558                 # Extract title
2559                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2560                 if mobj is None:
2561                         self._downloader.trouble(u'ERROR: unable to extract video title')
2562                         return
2563                 video_title = mobj.group(1).decode('utf-8')
2564
2565                 # Extract description
2566                 video_description = u'No description available.'
2567                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2568                 if mobj is not None:
2569                         video_description = mobj.group(1).decode('utf-8')
2570
2571                 video_filename = video_url.split('/')[-1]
2572                 video_id, extension = video_filename.split('.')
2573
2574                 info = {
2575                         'id': video_id,
2576                         'url': video_url,
2577                         'uploader': None,
2578                         'upload_date': None,
2579                         'title': video_title,
2580                         'stitle': simplify_title(video_title),
2581                         'ext': extension,
2582                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2583                         'thumbnail': None,
2584                         'description': video_description,
2585                         'player_url': None,
2586                 }
2587
2588                 return [info]
2589
2590 class MixcloudIE(InfoExtractor):
2591         """Information extractor for www.mixcloud.com"""
2592         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2593         IE_NAME = u'mixcloud'
2594
2595         def __init__(self, downloader=None):
2596                 InfoExtractor.__init__(self, downloader)
2597
2598         def report_download_json(self, file_id):
2599                 """Report JSON download."""
2600                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2601
2602         def report_extraction(self, file_id):
2603                 """Report information extraction."""
2604                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2605
2606         def get_urls(self, jsonData, fmt, bitrate='best'):
2607                 """Get urls from 'audio_formats' section in json"""
2608                 file_url = None
2609                 try:
2610                         bitrate_list = jsonData[fmt]
2611                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2612                                 bitrate = max(bitrate_list) # select highest
2613
2614                         url_list = jsonData[fmt][bitrate]
2615                 except TypeError: # we have no bitrate info.
2616                         url_list = jsonData[fmt]
2617
2618                 return url_list
2619
2620         def check_urls(self, url_list):
2621                 """Returns 1st active url from list"""
2622                 for url in url_list:
2623                         try:
2624                                 urllib2.urlopen(url)
2625                                 return url
2626                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2627                                 url = None
2628
2629                 return None
2630
2631         def _print_formats(self, formats):
2632                 print 'Available formats:'
2633                 for fmt in formats.keys():
2634                         for b in formats[fmt]:
2635                                 try:
2636                                         ext = formats[fmt][b][0]
2637                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2638                                 except TypeError: # we have no bitrate info
2639                                         ext = formats[fmt][0]
2640                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2641                                         break
2642
2643         def _real_extract(self, url):
2644                 mobj = re.match(self._VALID_URL, url)
2645                 if mobj is None:
2646                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2647                         return
2648                 # extract uploader & filename from url
2649                 uploader = mobj.group(1).decode('utf-8')
2650                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2651
2652                 # construct API request
2653                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2654                 # retrieve .json file with links to files
2655                 request = urllib2.Request(file_url)
2656                 try:
2657                         self.report_download_json(file_url)
2658                         jsonData = urllib2.urlopen(request).read()
2659                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2660                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2661                         return
2662
2663                 # parse JSON
2664                 json_data = json.loads(jsonData)
2665                 player_url = json_data['player_swf_url']
2666                 formats = dict(json_data['audio_formats'])
2667
2668                 req_format = self._downloader.params.get('format', None)
2669                 bitrate = None
2670
2671                 if self._downloader.params.get('listformats', None):
2672                         self._print_formats(formats)
2673                         return
2674
2675                 if req_format is None or req_format == 'best':
2676                         for format_param in formats.keys():
2677                                 url_list = self.get_urls(formats, format_param)
2678                                 # check urls
2679                                 file_url = self.check_urls(url_list)
2680                                 if file_url is not None:
2681                                         break # got it!
2682                 else:
2683                         if req_format not in formats.keys():
2684                                 self._downloader.trouble(u'ERROR: format is not available')
2685                                 return
2686
2687                         url_list = self.get_urls(formats, req_format)
2688                         file_url = self.check_urls(url_list)
2689                         format_param = req_format
2690
2691                 return [{
2692                         'id': file_id.decode('utf-8'),
2693                         'url': file_url.decode('utf-8'),
2694                         'uploader':     uploader.decode('utf-8'),
2695                         'upload_date': u'NA',
2696                         'title': json_data['name'],
2697                         'stitle': simplify_title(json_data['name']),
2698                         'ext': file_url.split('.')[-1].decode('utf-8'),
2699                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2700                         'thumbnail': json_data['thumbnail_url'],
2701                         'description': json_data['description'],
2702                         'player_url': player_url.decode('utf-8'),
2703                 }]
2704
2705 class StanfordOpenClassroomIE(InfoExtractor):
2706         """Information extractor for Stanford's Open ClassRoom"""
2707
2708         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2709         IE_NAME = u'stanfordoc'
2710
2711         def report_download_webpage(self, objid):
2712                 """Report information extraction."""
2713                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2714
2715         def report_extraction(self, video_id):
2716                 """Report information extraction."""
2717                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2718
2719         def _real_extract(self, url):
2720                 mobj = re.match(self._VALID_URL, url)
2721                 if mobj is None:
2722                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2723                         return
2724
2725                 if mobj.group('course') and mobj.group('video'): # A specific video
2726                         course = mobj.group('course')
2727                         video = mobj.group('video')
2728                         info = {
2729                                 'id': simplify_title(course + '_' + video),
2730                         }
2731
2732                         self.report_extraction(info['id'])
2733                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2734                         xmlUrl = baseUrl + video + '.xml'
2735                         try:
2736                                 metaXml = urllib2.urlopen(xmlUrl).read()
2737                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2738                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2739                                 return
2740                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2741                         try:
2742                                 info['title'] = mdoc.findall('./title')[0].text
2743                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2744                         except IndexError:
2745                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2746                                 return
2747                         info['stitle'] = simplify_title(info['title'])
2748                         info['ext'] = info['url'].rpartition('.')[2]
2749                         info['format'] = info['ext']
2750                         return [info]
2751                 elif mobj.group('course'): # A course page
2752                         course = mobj.group('course')
2753                         info = {
2754                                 'id': simplify_title(course),
2755                                 'type': 'playlist',
2756                         }
2757
2758                         self.report_download_webpage(info['id'])
2759                         try:
2760                                 coursepage = urllib2.urlopen(url).read()
2761                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2762                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2763                                 return
2764
2765                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2766                         if m:
2767                                 info['title'] = unescapeHTML(m.group(1))
2768                         else:
2769                                 info['title'] = info['id']
2770                         info['stitle'] = simplify_title(info['title'])
2771
2772                         m = re.search('<description>([^<]+)</description>', coursepage)
2773                         if m:
2774                                 info['description'] = unescapeHTML(m.group(1))
2775
2776                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2777                         info['list'] = [
2778                                 {
2779                                         'type': 'reference',
2780                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2781                                 }
2782                                         for vpage in links]
2783                         results = []
2784                         for entry in info['list']:
2785                                 assert entry['type'] == 'reference'
2786                                 results += self.extract(entry['url'])
2787                         return results
2788
2789                 else: # Root page
2790                         info = {
2791                                 'id': 'Stanford OpenClassroom',
2792                                 'type': 'playlist',
2793                         }
2794
2795                         self.report_download_webpage(info['id'])
2796                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2797                         try:
2798                                 rootpage = urllib2.urlopen(rootURL).read()
2799                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2800                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2801                                 return
2802
2803                         info['title'] = info['id']
2804                         info['stitle'] = simplify_title(info['title'])
2805
2806                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2807                         info['list'] = [
2808                                 {
2809                                         'type': 'reference',
2810                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2811                                 }
2812                                         for cpage in links]
2813
2814                         results = []
2815                         for entry in info['list']:
2816                                 assert entry['type'] == 'reference'
2817                                 results += self.extract(entry['url'])
2818                         return results
2819
2820 class MTVIE(InfoExtractor):
2821         """Information extractor for MTV.com"""
2822
2823         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2824         IE_NAME = u'mtv'
2825
2826         def report_webpage(self, video_id):
2827                 """Report information extraction."""
2828                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2829
2830         def report_extraction(self, video_id):
2831                 """Report information extraction."""
2832                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2833
2834         def _real_extract(self, url):
2835                 mobj = re.match(self._VALID_URL, url)
2836                 if mobj is None:
2837                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2838                         return
2839                 if not mobj.group('proto'):
2840                         url = 'http://' + url
2841                 video_id = mobj.group('videoid')
2842                 self.report_webpage(video_id)
2843
2844                 request = urllib2.Request(url)
2845                 try:
2846                         webpage = urllib2.urlopen(request).read()
2847                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2848                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2849                         return
2850
2851                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2852                 if mobj is None:
2853                         self._downloader.trouble(u'ERROR: unable to extract song name')
2854                         return
2855                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2856                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2857                 if mobj is None:
2858                         self._downloader.trouble(u'ERROR: unable to extract performer')
2859                         return
2860                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2861                 video_title = performer + ' - ' + song_name
2862
2863                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2864                 if mobj is None:
2865                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2866                         return
2867                 mtvn_uri = mobj.group(1)
2868
2869                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2870                 if mobj is None:
2871                         self._downloader.trouble(u'ERROR: unable to extract content id')
2872                         return
2873                 content_id = mobj.group(1)
2874
2875                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2876                 self.report_extraction(video_id)
2877                 request = urllib2.Request(videogen_url)
2878                 try:
2879                         metadataXml = urllib2.urlopen(request).read()
2880                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2881                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2882                         return
2883
2884                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2885                 renditions = mdoc.findall('.//rendition')
2886
2887                 # For now, always pick the highest quality.
2888                 rendition = renditions[-1]
2889
2890                 try:
2891                         _,_,ext = rendition.attrib['type'].partition('/')
2892                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2893                         video_url = rendition.find('./src').text
2894                 except KeyError:
2895                         self._downloader.trouble('Invalid rendition field.')
2896                         return
2897
2898                 info = {
2899                         'id': video_id,
2900                         'url': video_url,
2901                         'uploader': performer,
2902                         'title': video_title,
2903                         'stitle': simplify_title(video_title),
2904                         'ext': ext,
2905                         'format': format,
2906                 }
2907
2908                 return [info]