2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    uploader: Nickname of the video uploader.
    stitle: Simplified title.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one line between the docstring and this call is
        # elided in this excerpt (presumably lazy-init state setup — verify).
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL must be defined by each concrete subclass.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header and its guard logic
    # are elided from this excerpt; only the docstring and the delegation
    # to _real_initialize() remain visible.
    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): one line (presumably a call triggering lazy
        # initialization) is elided before this return.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the (no-op) body line is elided in this excerpt.

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the (no-op) body line is elided in this excerpt.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    NOTE(review): this class is a sampled excerpt — `try:` headers,
    `return` statements, dict closers and several guard lines are elided
    throughout; code lines below are kept exactly as visible.
    """

    # Matches watch pages, youtu.be short links, embeds and nocookie hosts;
    # group 2 captures the 11-char video id (group 1 anchors the host part).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; remaining entries and the closing brace
    # are elided in this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions; entries and closing brace elided in excerpt.
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text.

        NOTE(review): excerpt — the accumulator initialization, the float
        conversion of `start`, the per-caption counter line and the final
        return are elided from this view.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Default duration when the <text> tag carries no dur attribute.
            if not dur: dur = '4'
            # NOTE(review): an elided line presumably converts `start` to
            # float before this arithmetic — verify against the full file.
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, then log in (params or .netrc) and confirm age.

        NOTE(review): excerpt — early returns, `try:` headers and the
        login/age form dict assignments are partially elided below.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English UI so later regexes match (failure is non-fatal).
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # NOTE(review): the `login_form = {` opener is elided in this excerpt.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # The login form reappearing in the response means authentication failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # NOTE(review): the `age_form = {` opener is elided in this excerpt.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick formats, and
        build the result dict(s).

        NOTE(review): excerpt — guard `if`s, `try:` headers, `return`s and
        the result-dict opener are partially elided below.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JSON-escaped slashes in the SWF URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants; stop at the first response with a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scrape the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions (optional)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            # Prefer the user-requested language, then English, then the first listed.
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
            srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            self._downloader.trouble(u'WARNING: video has no closed captions')

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')
            # NOTE(review): the result-dict opener is elided in this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): sampled excerpt — `try:` headers, guard `if`s and
    `return` lines are elided throughout; code lines kept as visible.
    """

    # Group 1 = video id, group 2 = simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # NOTE(review): the `disclaimer_form = {` opener is elided here.
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: pull the media URL out of the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    # Group 1 = video id (before the first underscore), group 2 = title slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and description from a videoplay page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback: no MP4 download URL -> scrape the escaped FLV stream URL.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping used in the page's inline JavaScript.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Search page is fetched only to scrape the thumbnail image.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    # Group 1 = the .flv filename from the `current` query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a media page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title regex captures both the title (group 1) and uploader (group 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE(review): sampled excerpt — guard `if`s, `try:` headers and
    `return` lines are elided; code lines kept as visible.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; rewrites non-/watch/ URLs and
        recurses once (new_video=False) on the rewritten URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) here captures the 'people|profile' path
        # alternative, not the uploader name — group(2) looks intended; verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the result-dict opener/return is elided in this excerpt.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
        # NOTE(review): 'thumbnail' appears twice in this dict literal; the
        # later entry silently wins — likely an unintended duplicate key.
        'thumbnail': video_thumbnail,
# Information extractor for vimeo.com. Scrapes the watch page, parses the
# inline "{config:...}" JSON for title/uploader/thumbnail, picks the first
# available codec from a fixed preference list, and builds a signed
# play_redirect URL.
# NOTE(review): this view of the source is elided — control-flow lines
# (try:, "if mobj is None:", return) are missing between sampled lines.
# Comments only; code tokens left byte-identical.
991 class VimeoIE(InfoExtractor):
992 """Information extractor for vimeo.com."""
994 # _VALID_URL matches Vimeo URLs
995 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
998 def __init__(self, downloader=None):
999 InfoExtractor.__init__(self, downloader)
1001 def report_download_webpage(self, video_id):
1002 """Report webpage download."""
1003 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1005 def report_extraction(self, video_id):
1006 """Report information extraction."""
1007 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1009 def _real_extract(self, url, new_video=True):
1010 # Extract ID from URL
1011 mobj = re.match(self._VALID_URL, url)
1013 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1016 video_id = mobj.group(1)
1018 # Retrieve video webpage to extract further information
1019 request = urllib2.Request(url, None, std_headers)
1021 self.report_download_webpage(video_id)
1022 webpage = urllib2.urlopen(request).read()
1023 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1024 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1027 # Now we begin extracting as much information as we can from what we
1028 # retrieved. First we extract the information common to all extractors,
1029 # and latter we extract those that are Vimeo specific.
1030 self.report_extraction(video_id)
1032 # Extract the config JSON
# Brittle string split on the page's embedded JS config object; any markup
# change upstream breaks this — TODO confirm still matches the live page.
1033 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1035 config = json.loads(config)
1037 self._downloader.trouble(u'ERROR: unable to extract info section')
1041 video_title = config["video"]["title"]
1042 simple_title = simplify_title(video_title)
1045 video_uploader = config["video"]["owner"]["name"]
1047 # Extract video thumbnail
1048 video_thumbnail = config["video"]["thumbnail"]
1050 # Extract video description
1051 video_description = get_element_by_id("description", webpage.decode('utf8'))
1052 if video_description: video_description = clean_html(video_description)
1053 else: video_description = ''
1055 # Extract upload date
1056 video_upload_date = u'NA'
1057 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1058 if mobj is not None:
1059 video_upload_date = mobj.group(1)
1061 # Vimeo specific: extract request signature and timestamp
1062 sig = config['request']['signature']
1063 timestamp = config['request']['timestamp']
1065 # Vimeo specific: extract video codec and quality information
1066 # TODO bind to format param
# Preference order: h264 first, then vp8, then vp6; first match wins.
1067 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1068 for codec in codecs:
1069 if codec[0] in config["video"]["files"]:
1070 video_codec = codec[0]
1071 video_extension = codec[1]
1072 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1073 else: quality = 'sd'
1076 self._downloader.trouble(u'ERROR: no known codec found')
1079 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1080 %(video_id, sig, timestamp, quality, video_codec.upper())
# Result dictionary fields match the InfoExtractor contract described in the
# class docstring at the top of the file (id/url lines elided from this view).
1085 'uploader': video_uploader,
1086 'upload_date': video_upload_date,
1087 'title': video_title,
1088 'stitle': simple_title,
1089 'ext': video_extension,
1090 'thumbnail': video_thumbnail,
1091 'description': video_description,
# Generic last-resort extractor: follows URL-shortener redirects via HEAD
# requests, then scans the page for a JW-Player/SWFObject "file=" video URL
# and derives id/title/uploader heuristically.
# NOTE(review): this view of the source is elided — try:/if/return lines are
# missing between sampled lines. Comments only; code tokens left untouched.
1096 class GenericIE(InfoExtractor):
1097 """Generic last-resort information extractor."""
1100 IE_NAME = u'generic'
1102 def __init__(self, downloader=None):
1103 InfoExtractor.__init__(self, downloader)
1105 def report_download_webpage(self, video_id):
1106 """Report webpage download."""
1107 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1108 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1110 def report_extraction(self, video_id):
1111 """Report information extraction."""
1112 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1114 def report_following_redirect(self, new_url):
1115 """Report information extraction."""
1116 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1118 def _test_redirect(self, url):
1119 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass that issues HEAD instead of GET (body elided here).
1120 class HeadRequest(urllib2.Request):
1121 def get_method(self):
1124 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1126 Subclass the HTTPRedirectHandler to make it use our
1127 HeadRequest also on the redirected URL
1129 def redirect_request(self, req, fp, code, msg, headers, newurl):
1130 if code in (301, 302, 303, 307):
# Some servers emit unencoded spaces in Location headers; patch them up.
1131 newurl = newurl.replace(' ', '%20')
1132 newheaders = dict((k,v) for k,v in req.headers.items()
1133 if k.lower() not in ("content-length", "content-type"))
1134 return HeadRequest(newurl,
1136 origin_req_host=req.get_origin_req_host(),
1139 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1141 class HTTPMethodFallback(urllib2.BaseHandler):
1143 Fallback to GET if HEAD is not allowed (405 HTTP error)
1145 def http_error_405(self, req, fp, code, msg, headers):
1149 newheaders = dict((k,v) for k,v in req.headers.items()
1150 if k.lower() not in ("content-length", "content-type"))
1151 return self.parent.open(urllib2.Request(req.get_full_url(),
1153 origin_req_host=req.get_origin_req_host(),
# Hand-built opener: deliberately omits the default redirect handler so the
# HEAD-preserving handlers above control redirect behavior.
1157 opener = urllib2.OpenerDirector()
1158 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1159 HTTPMethodFallback, HEADRedirectHandler,
1160 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1161 opener.add_handler(handler())
1163 response = opener.open(HeadRequest(url))
1164 new_url = response.geturl()
# No redirect happened; caller proceeds with normal extraction.
1166 if url == new_url: return False
# Redirect detected: restart the whole extractor chain on the target URL.
1168 self.report_following_redirect(new_url)
1169 self._downloader.download([new_url])
1172 def _real_extract(self, url):
1173 if self._test_redirect(url): return
1175 video_id = url.split('/')[-1]
1176 request = urllib2.Request(url)
1178 self.report_download_webpage(video_id)
1179 webpage = urllib2.urlopen(request).read()
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1183 except ValueError, err:
1184 # since this is the last-resort InfoExtractor, if
1185 # this error is thrown, it'll be thrown here
1186 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1189 self.report_extraction(video_id)
1190 # Start with something easy: JW Player in SWFObject
1191 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1193 # Broaden the search a little bit
1194 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1196 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1199 # It's possible that one of the regexes
1200 # matched, but returned an empty group:
1201 if mobj.group(1) is None:
1202 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1205 video_url = urllib.unquote(mobj.group(1))
1206 video_id = os.path.basename(video_url)
1208 # here's a fun little line of code for you:
1209 video_extension = os.path.splitext(video_id)[1][1:]
1210 video_id = os.path.splitext(video_id)[0]
1212 # it's tempting to parse this further, but you would
1213 # have to take into account all the variations like
1214 # Video Title - Site Name
1215 # Site Name | Video Title
1216 # Video Title - Tagline | Site Name
1217 # and so on and so forth; it's just not practical
1218 mobj = re.search(r'<title>(.*)</title>', webpage)
1220 self._downloader.trouble(u'ERROR: unable to extract title')
1222 video_title = mobj.group(1).decode('utf-8')
1223 video_title = sanitize_title(video_title)
1224 simple_title = simplify_title(video_title)
1226 # video uploader is domain name
1227 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this step extracts the
# uploader (domain) — message looks copy-pasted; flag for a future fix.
1229 self._downloader.trouble(u'ERROR: unable to extract title')
1231 video_uploader = mobj.group(1).decode('utf-8')
1234 'id': video_id.decode('utf-8'),
1235 'url': video_url.decode('utf-8'),
1236 'uploader': video_uploader,
1237 'upload_date': u'NA',
1238 'title': video_title,
1239 'stitle': simple_title,
1240 'ext': video_extension.decode('utf-8'),
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs: pages through
# the GData JSON API (50 results per page) and hands each video id back to
# the downloader as a watch URL.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1246 class YoutubeSearchIE(InfoExtractor):
1247 """Information Extractor for YouTube search queries."""
1248 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1249 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1250 _max_youtube_results = 1000
1251 IE_NAME = u'youtube:search'
1253 def __init__(self, downloader=None):
1254 InfoExtractor.__init__(self, downloader)
1256 def report_download_page(self, query, pagenum):
1257 """Report attempt to download playlist page with given number."""
1258 query = query.decode(preferredencoding())
1259 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1261 def _real_extract(self, query):
1262 mobj = re.match(self._VALID_URL, query)
1264 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; prefix selects result count.
1267 prefix, query = query.split(':')
1269 query = query.encode('utf-8')
1271 self._download_n_results(query, 1)
1273 elif prefix == 'all':
1274 self._download_n_results(query, self._max_youtube_results)
1280 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1282 elif n > self._max_youtube_results:
1283 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1284 n = self._max_youtube_results
1285 self._download_n_results(query, n)
1287 except ValueError: # parsing prefix as integer fails
1288 self._download_n_results(query, 1)
1291 def _download_n_results(self, query, n):
1292 """Downloads a specified number of results for a query"""
# Page through the API until enough ids are collected; 'limit' is refined
# from the API's totalItems so we stop early on small result sets.
1298 while (50 * pagenum) < limit:
1299 self.report_download_page(query, pagenum+1)
1300 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1301 request = urllib2.Request(result_url)
1303 data = urllib2.urlopen(request).read()
1304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1305 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1307 api_response = json.loads(data)['data']
1309 new_ids = list(video['id'] for video in api_response['items'])
1310 video_ids += new_ids
1312 limit = min(n, api_response['totalItems'])
1315 if len(video_ids) > n:
1316 video_ids = video_ids[:n]
1317 for id in video_ids:
1318 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Handles "gvsearchN:query" pseudo-URLs for Google Video: scrapes result
# pages with _VIDEO_INDICATOR until N ids are found or no "next page" link
# remains, then dispatches each videoplay URL to the downloader.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1322 class GoogleSearchIE(InfoExtractor):
1323 """Information Extractor for Google Video search queries."""
1324 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1325 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1326 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1327 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1328 _max_google_results = 1000
1329 IE_NAME = u'video.google:search'
1331 def __init__(self, downloader=None):
1332 InfoExtractor.__init__(self, downloader)
1334 def report_download_page(self, query, pagenum):
1335 """Report attempt to download playlist page with given number."""
1336 query = query.decode(preferredencoding())
1337 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1339 def _real_extract(self, query):
1340 mobj = re.match(self._VALID_URL, query)
1342 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix convention as the YouTube search extractor above.
1345 prefix, query = query.split(':')
1347 query = query.encode('utf-8')
1349 self._download_n_results(query, 1)
1351 elif prefix == 'all':
1352 self._download_n_results(query, self._max_google_results)
1358 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1360 elif n > self._max_google_results:
1361 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1362 n = self._max_google_results
1363 self._download_n_results(query, n)
1365 except ValueError: # parsing prefix as integer fails
1366 self._download_n_results(query, 1)
1369 def _download_n_results(self, query, n):
1370 """Downloads a specified number of results for a query"""
1376 self.report_download_page(query, pagenum)
1377 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1378 request = urllib2.Request(result_url)
1380 page = urllib2.urlopen(request).read()
1381 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1382 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1385 # Extract video identifiers
1386 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1387 video_id = mobj.group(1)
1388 if video_id not in video_ids:
1389 video_ids.append(video_id)
1390 if len(video_ids) == n:
1391 # Specified n videos reached
1392 for id in video_ids:
1393 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more result pages: dispatch whatever was collected so far.
1396 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1397 for id in video_ids:
1398 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1401 pagenum = pagenum + 1
# Handles "yvsearchN:query" pseudo-URLs for Yahoo! Video. Same paging
# pattern as the Google search extractor, but tracks duplicates with an
# explicit 'already_seen' set instead of a list-membership test.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1404 class YahooSearchIE(InfoExtractor):
1405 """Information Extractor for Yahoo! Video search queries."""
1406 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1407 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1408 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1409 _MORE_PAGES_INDICATOR = r'\s*Next'
1410 _max_yahoo_results = 1000
1411 IE_NAME = u'video.yahoo:search'
1413 def __init__(self, downloader=None):
1414 InfoExtractor.__init__(self, downloader)
1416 def report_download_page(self, query, pagenum):
1417 """Report attempt to download playlist page with given number."""
1418 query = query.decode(preferredencoding())
1419 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1421 def _real_extract(self, query):
1422 mobj = re.match(self._VALID_URL, query)
1424 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1427 prefix, query = query.split(':')
1429 query = query.encode('utf-8')
1431 self._download_n_results(query, 1)
1433 elif prefix == 'all':
1434 self._download_n_results(query, self._max_yahoo_results)
1440 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1442 elif n > self._max_yahoo_results:
1443 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1444 n = self._max_yahoo_results
1445 self._download_n_results(query, n)
1447 except ValueError: # parsing prefix as integer fails
1448 self._download_n_results(query, 1)
1451 def _download_n_results(self, query, n):
1452 """Downloads a specified number of results for a query"""
1455 already_seen = set()
1459 self.report_download_page(query, pagenum)
1460 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1461 request = urllib2.Request(result_url)
1463 page = urllib2.urlopen(request).read()
1464 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1465 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1468 # Extract video identifiers
1469 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1470 video_id = mobj.group(1)
1471 if video_id not in already_seen:
1472 video_ids.append(video_id)
1473 already_seen.add(video_id)
1474 if len(video_ids) == n:
1475 # Specified n videos reached
1476 for id in video_ids:
1477 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more result pages: dispatch whatever was collected so far.
1480 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1481 for id in video_ids:
1482 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1485 pagenum = pagenum + 1
# Extractor for YouTube playlist/artist/course URLs: pages through the
# playlist HTML collecting watch?v= ids, applies the user's
# playliststart/playlistend window, and dispatches each video.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1488 class YoutubePlaylistIE(InfoExtractor):
1489 """Information Extractor for YouTube playlists."""
1491 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1492 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1493 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1494 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1495 IE_NAME = u'youtube:playlist'
1497 def __init__(self, downloader=None):
1498 InfoExtractor.__init__(self, downloader)
1500 def report_download_page(self, playlist_id, pagenum):
1501 """Report attempt to download playlist page with given number."""
1502 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1504 def _real_extract(self, url):
1505 # Extract playlist id
1506 mobj = re.match(self._VALID_URL, url)
1508 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 present means the URL points at a single video within the
# playlist; delegate just that video to the downloader.
1512 if mobj.group(3) is not None:
1513 self._downloader.download([mobj.group(3)])
1516 # Download playlist pages
1517 # prefix is 'p' as default for playlists but there are other types that need extra care
1518 playlist_prefix = mobj.group(1)
1519 if playlist_prefix == 'a':
1520 playlist_access = 'artist'
1522 playlist_prefix = 'p'
1523 playlist_access = 'view_play_list'
1524 playlist_id = mobj.group(2)
1529 self.report_download_page(playlist_id, pagenum)
1530 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1531 request = urllib2.Request(url)
1533 page = urllib2.urlopen(request).read()
1534 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1535 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1538 # Extract video identifiers
1540 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1541 if mobj.group(1) not in ids_in_page:
1542 ids_in_page.append(mobj.group(1))
1543 video_ids.extend(ids_in_page)
1545 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1547 pagenum = pagenum + 1
# Apply the user-selected playlist window (1-based start; -1 end = open).
1549 playliststart = self._downloader.params.get('playliststart', 1) - 1
1550 playlistend = self._downloader.params.get('playlistend', -1)
1551 if playlistend == -1:
1552 video_ids = video_ids[playliststart:]
1554 video_ids = video_ids[playliststart:playlistend]
1556 for id in video_ids:
1557 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Extractor for YouTube user pages / "ytuser:" pseudo-URLs: pages through
# the GData uploads feed (_GDATA_PAGE_SIZE per request) until a short page
# signals the end, applies the playlist window, then dispatches each video.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1561 class YoutubeUserIE(InfoExtractor):
1562 """Information Extractor for YouTube users."""
1564 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1565 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1566 _GDATA_PAGE_SIZE = 50
1567 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1568 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1569 IE_NAME = u'youtube:user'
1571 def __init__(self, downloader=None):
1572 InfoExtractor.__init__(self, downloader)
1574 def report_download_page(self, username, start_index):
1575 """Report attempt to download user page."""
1576 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1577 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1579 def _real_extract(self, url):
1581 mobj = re.match(self._VALID_URL, url)
1583 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1586 username = mobj.group(1)
1588 # Download video ids using YouTube Data API. Result size per
1589 # query is limited (currently to 50 videos) so we need to query
1590 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1597 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1598 self.report_download_page(username, start_index)
1600 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1603 page = urllib2.urlopen(request).read()
1604 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1605 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1608 # Extract video identifiers
1611 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1612 if mobj.group(1) not in ids_in_page:
1613 ids_in_page.append(mobj.group(1))
1615 video_ids.extend(ids_in_page)
1617 # A little optimization - if current page is not
1618 # "full", ie. does not contain PAGE_SIZE video ids then
1619 # we can assume that this page is the last one - there
1620 # are no more ids on further pages - no need to query
1623 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1628 all_ids_count = len(video_ids)
# Same playliststart/playlistend windowing as YoutubePlaylistIE.
1629 playliststart = self._downloader.params.get('playliststart', 1) - 1
1630 playlistend = self._downloader.params.get('playlistend', -1)
1632 if playlistend == -1:
1633 video_ids = video_ids[playliststart:]
1635 video_ids = video_ids[playliststart:playlistend]
1637 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1638 (username, all_ids_count, len(video_ids)))
1640 for video_id in video_ids:
1641 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# Extractor for depositfiles.com: rewrites the URL to the English locale,
# POSTs the 'Free download' form, then scrapes the real fileshare URL and
# file title from the response.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1644 class DepositFilesIE(InfoExtractor):
1645 """Information extractor for depositfiles.com"""
1647 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1648 IE_NAME = u'DepositFiles'
1650 def __init__(self, downloader=None):
1651 InfoExtractor.__init__(self, downloader)
1653 def report_download_webpage(self, file_id):
1654 """Report webpage download."""
1655 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1657 def report_extraction(self, file_id):
1658 """Report information extraction."""
1659 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1661 def _real_extract(self, url):
1662 file_id = url.split('/')[-1]
1663 # Rebuild url in english locale
1664 url = 'http://depositfiles.com/en/files/' + file_id
1666 # Retrieve file webpage with 'Free download' button pressed
# Passing POST data makes urllib2 issue a POST, simulating the button press.
1667 free_download_indication = { 'gateway_result' : '1' }
1668 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1670 self.report_download_webpage(file_id)
1671 webpage = urllib2.urlopen(request).read()
1672 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1673 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1676 # Search for the real file URL
1677 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1678 if (mobj is None) or (mobj.group(1) is None):
1679 # Try to figure out reason of the error.
# The site explains restrictions (e.g. download limits) in an
# "Attention..." banner; surface that text to the user when present.
1680 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1681 if (mobj is not None) and (mobj.group(1) is not None):
1682 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1683 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1685 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1688 file_url = mobj.group(1)
1689 file_extension = os.path.splitext(file_url)[1][1:]
1691 # Search for file title
1692 mobj = re.search(r'<b title="(.*?)">', webpage)
1694 self._downloader.trouble(u'ERROR: unable to extract title')
1696 file_title = mobj.group(1).decode('utf-8')
1699 'id': file_id.decode('utf-8'),
1700 'url': file_url.decode('utf-8'),
1702 'upload_date': u'NA',
1703 'title': file_title,
# Files have no separate simplified title; reuse the title as-is.
1704 'stitle': file_title,
1705 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Logs in during _real_initialize (username/
# password from options or .netrc), scrapes the video page's inline JS
# ("video_title", "video_owner_name", "*_src" calls) via _parse_page, then
# selects formats with the same format/format_limit logic as the YouTube
# extractor.
# NOTE(review): elided source view — try:/if/return and several assignment
# lines are missing between sampled lines. Comments only; tokens untouched.
1712 class FacebookIE(InfoExtractor):
1713 """Information Extractor for Facebook"""
1715 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1716 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1717 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; used both for scraping URLs and format selection.
1718 _available_formats = ['video', 'highqual', 'lowqual']
1719 _video_extensions = {
1724 IE_NAME = u'facebook'
1726 def __init__(self, downloader=None):
1727 InfoExtractor.__init__(self, downloader)
1729 def _reporter(self, message):
1730 """Add header and report message."""
1731 self._downloader.to_screen(u'[facebook] %s' % message)
1733 def report_login(self):
1734 """Report attempt to log in."""
1735 self._reporter(u'Logging in')
1737 def report_video_webpage_download(self, video_id):
1738 """Report attempt to download video webpage."""
1739 self._reporter(u'%s: Downloading video webpage' % video_id)
1741 def report_information_extraction(self, video_id):
1742 """Report attempt to extract video information."""
1743 self._reporter(u'%s: Extracting video information' % video_id)
1745 def _parse_page(self, video_webpage):
1746 """Extract video information from page"""
# Map of result keys to the JS/HTML patterns that carry each value.
1748 data = {'title': r'\("video_title", "(.*?)"\)',
1749 'description': r'<div class="datawrap">(.*?)</div>',
1750 'owner': r'\("video_owner_name", "(.*?)"\)',
1751 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1754 for piece in data.keys():
1755 mobj = re.search(data[piece], video_webpage)
1756 if mobj is not None:
1757 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one stream URL per known format name, keyed by format.
1761 for fmt in self._available_formats:
1762 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1763 if mobj is not None:
1764 # URL is in a Javascript segment inside an escaped Unicode format within
1765 # the generally utf-8 page
1766 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1767 video_info['video_urls'] = video_urls
1771 def _real_initialize(self):
1772 if self._downloader is None:
1777 downloader_params = self._downloader.params
1779 # Attempt to use provided username and password or .netrc data
1780 if downloader_params.get('username', None) is not None:
1781 useremail = downloader_params['username']
1782 password = downloader_params['password']
1783 elif downloader_params.get('usenetrc', False):
1785 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1786 if info is not None:
1790 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1791 except (IOError, netrc.NetrcParseError), err:
1792 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login (anonymous access).
1795 if useremail is None:
1804 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1807 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
1808 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1809 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1815 def _real_extract(self, url):
1816 mobj = re.match(self._VALID_URL, url)
1818 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1820 video_id = mobj.group('ID')
1823 self.report_video_webpage_download(video_id)
1824 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1826 page = urllib2.urlopen(request)
1827 video_webpage = page.read()
1828 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1829 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1832 # Start extracting information
1833 self.report_information_extraction(video_id)
1835 # Extract information
1836 video_info = self._parse_page(video_webpage)
1839 if 'owner' not in video_info:
1840 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1842 video_uploader = video_info['owner']
1845 if 'title' not in video_info:
1846 self._downloader.trouble(u'ERROR: unable to extract video title')
1848 video_title = video_info['title']
1849 video_title = video_title.decode('utf-8')
1850 video_title = sanitize_title(video_title)
1852 simple_title = simplify_title(video_title)
# Missing thumbnail is non-fatal: warn and continue with ''.
1855 if 'thumbnail' not in video_info:
1856 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1857 video_thumbnail = ''
1859 video_thumbnail = video_info['thumbnail']
1863 if 'upload_date' in video_info:
1864 upload_time = video_info['upload_date']
1865 timetuple = email.utils.parsedate_tz(upload_time)
1866 if timetuple is not None:
1868 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1873 video_description = video_info.get('description', 'No description available.')
1875 url_map = video_info['video_urls']
1876 if len(url_map.keys()) > 0:
1877 # Decide which formats to download
1878 req_format = self._downloader.params.get('format', None)
1879 format_limit = self._downloader.params.get('format_limit', None)
# format_limit trims the preference list to formats at or below the cap.
1881 if format_limit is not None and format_limit in self._available_formats:
1882 format_list = self._available_formats[self._available_formats.index(format_limit):]
1884 format_list = self._available_formats
1885 existing_formats = [x for x in format_list if x in url_map]
1886 if len(existing_formats) == 0:
1887 self._downloader.trouble(u'ERROR: no known formats available for video')
1889 if req_format is None:
1890 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1891 elif req_format == 'worst':
1892 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1893 elif req_format == '-1':
1894 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1897 if req_format not in url_map:
1898 self._downloader.trouble(u'ERROR: requested format not available')
1900 video_url_list = [(req_format, url_map[req_format])] # Specific format
1903 for format_param, video_real_url in video_url_list:
1905 video_extension = self._video_extensions.get(format_param, 'mp4')
1908 'id': video_id.decode('utf-8'),
1909 'url': video_real_url.decode('utf-8'),
1910 'uploader': video_uploader.decode('utf-8'),
1911 'upload_date': upload_date,
1912 'title': video_title,
1913 'stitle': simple_title,
1914 'ext': video_extension.decode('utf-8'),
1915 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1916 'thumbnail': video_thumbnail.decode('utf-8'),
1917 'description': video_description.decode('utf-8'),
# Extractor for blip.tv. Requests the JSON API view of the URL; if the
# server instead answers with a video/* Content-Type, treats the URL as a
# direct media download, otherwise parses the JSON 'Post' payload.
# NOTE(review): elided source view — try:/if/return lines missing between
# sampled lines. Comments only; code tokens left untouched.
1921 class BlipTVIE(InfoExtractor):
1922 """Information extractor for blip.tv"""
1924 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the filename extension from a media URL.
1925 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1926 IE_NAME = u'blip.tv'
1928 def report_extraction(self, file_id):
1929 """Report information extraction."""
1930 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1932 def report_direct_download(self, title):
1933 """Report information extraction."""
1934 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1936 def _real_extract(self, url):
1937 mobj = re.match(self._VALID_URL, url)
1939 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') depends on whether the URL already has a query string
# (selection elided from this view — TODO confirm against full source).
1946 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1947 request = urllib2.Request(json_url)
1948 self.report_extraction(mobj.group(1))
1951 urlh = urllib2.urlopen(request)
1952 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1953 basename = url.split('/')[-1]
1954 title,ext = os.path.splitext(basename)
1955 title = title.decode('UTF-8')
1956 ext = ext.replace('.', '')
1957 self.report_direct_download(title)
1962 'stitle': simplify_title(title),
1966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1967 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1969 if info is None: # Regular URL
1971 json_code = urlh.read()
1972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1973 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1977 json_data = json.loads(json_code)
1978 if 'Post' in json_data:
1979 data = json_data['Post']
# datestamp example format: '08-15-11 02:30PM' -> normalized to YYYYMMDD.
1983 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1984 video_url = data['media']['url']
1985 umobj = re.match(self._URL_EXT, video_url)
1987 raise ValueError('Can not determine filename extension')
1988 ext = umobj.group(1)
1991 'id': data['item_id'],
1993 'uploader': data['display_name'],
1994 'upload_date': upload_date,
1995 'title': data['title'],
1996 'stitle': simplify_title(data['title']),
1998 'format': data['media']['mimeType'],
1999 'thumbnail': data['thumbnailUrl'],
2000 'description': data['description'],
2001 'player_url': data['embedUrl']
# Any parse failure (missing key, bad date) is reported, not raised.
2003 except (ValueError,KeyError), err:
2004 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# NOTE(review): line-elided dump; some guard/return lines are missing from view.
# Extractor for myvideo.de: scrapes the watch page, derives the FLV URL from the
# thumbnail ('image_src') link, and the title from the <title> tag.
2010 class MyVideoIE(InfoExtractor):
2011 """Information Extractor for myvideo.de."""
2013 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2014 IE_NAME = u'myvideo'
2016 def __init__(self, downloader=None):
2017 InfoExtractor.__init__(self, downloader)
2019 def report_download_webpage(self, video_id):
2020 """Report webpage download."""
2021 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2023 def report_extraction(self, video_id):
2024 """Report information extraction."""
2025 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2027 def _real_extract(self,url):
2028 mobj = re.match(self._VALID_URL, url)
# BUG: `self._download` should be `self._downloader` — as written, an invalid URL
# raises AttributeError instead of reporting the error through trouble().
2030 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2033 video_id = mobj.group(1)
2036 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2038 self.report_download_webpage(video_id)
2039 webpage = urllib2.urlopen(request).read()
2040 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2041 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2044 self.report_extraction(video_id)
# The media base URL is embedded in the thumbnail link; the FLV lives next to it.
2045 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2048 self._downloader.trouble(u'ERROR: unable to extract media URL')
2050 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2052 mobj = re.search('<title>([^<]+)</title>', webpage)
2054 self._downloader.trouble(u'ERROR: unable to extract title')
2057 video_title = mobj.group(1)
2058 video_title = sanitize_title(video_title)
2060 simple_title = simplify_title(video_title)
2066 'upload_date': u'NA',
2067 'title': video_title,
2068 'stitle': simple_title,
# NOTE(review): line-elided dump; `if mobj is None:` / `try:` / `return` lines are
# missing from view. Extractor for The Daily Show / Colbert Report full episodes:
# resolves shortname URLs, finds the MTV media URI in the page, downloads the MRSS
# index, then one mediaGen config per item, and picks the highest-bitrate rendition.
2074 class ComedyCentralIE(InfoExtractor):
2075 """Information extractor for The Daily Show and Colbert Report """
# Accepts both bare shortnames (":tds", ":colbert", ...) and full-episode URLs.
2077 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2078 IE_NAME = u'comedycentral'
2080 def report_extraction(self, episode_id):
2081 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2083 def report_config_download(self, episode_id):
2084 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2086 def report_index_download(self, episode_id):
2087 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2089 def report_player_url(self, episode_id):
2090 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2092 def _real_extract(self, url):
2093 mobj = re.match(self._VALID_URL, url)
2095 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname form: rewrite to the show's full-episodes landing page and re-match.
2098 if mobj.group('shortname'):
2099 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2100 url = u'http://www.thedailyshow.com/full-episodes/'
2102 url = u'http://www.colbertnation.com/full-episodes/'
2103 mobj = re.match(self._VALID_URL, url)
2104 assert mobj is not None
# No episode slug means "download the newest episode".
2106 dlNewest = not mobj.group('episode')
2108 epTitle = mobj.group('showname')
2110 epTitle = mobj.group('episode')
2112 req = urllib2.Request(url)
2113 self.report_extraction(epTitle)
2115 htmlHandle = urllib2.urlopen(req)
2116 html = htmlHandle.read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-match the final URL.
2121 url = htmlHandle.geturl()
2122 mobj = re.match(self._VALID_URL, url)
2124 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2126 if mobj.group('episode') == '':
2127 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2129 epTitle = mobj.group('episode')
# Locate the Flash player URL + media URI embedded in the page.
2131 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2132 if len(mMovieParams) == 0:
2133 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirects so rtmpdump gets the final SWF.
2136 playerUrl_raw = mMovieParams[0][0]
2137 self.report_player_url(epTitle)
2139 urlHandle = urllib2.urlopen(playerUrl_raw)
2140 playerUrl = urlHandle.geturl()
2141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2142 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2145 uri = mMovieParams[0][1]
2146 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2147 self.report_index_download(epTitle)
2149 indexXml = urllib2.urlopen(indexUrl).read()
2150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2151 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video segment; each carries a guid like 'mgid:...:<showId>.com:<id>'.
2156 idoc = xml.etree.ElementTree.fromstring(indexXml)
2157 itemEls = idoc.findall('.//item')
2158 for itemEl in itemEls:
2159 mediaId = itemEl.findall('./guid')[0].text
2160 shortMediaId = mediaId.split(':')[-1]
2161 showId = mediaId.split(':')[-2].replace('.com', '')
2162 officialTitle = itemEl.findall('./title')[0].text
2163 officialDate = itemEl.findall('./pubDate')[0].text
2165 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2166 urllib.urlencode({'uri': mediaId}))
2167 configReq = urllib2.Request(configUrl)
2168 self.report_config_download(epTitle)
2170 configXml = urllib2.urlopen(configReq).read()
2171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2172 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, src) pairs from the mediaGen config.
2175 cdoc = xml.etree.ElementTree.fromstring(configXml)
2177 for rendition in cdoc.findall('.//rendition'):
2178 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2182 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2185 # For now, just pick the highest bitrate
2186 format,video_url = turls[-1]
2188 effTitle = showId + u'-' + epTitle
2193 'upload_date': officialDate,
2195 'stitle': simplify_title(effTitle),
2199 'description': officialTitle,
2200 'player_url': playerUrl
2203 results.append(info)
# NOTE(review): line-elided dump; guard lines are missing from view.
# Extractor for The Escapist: reads the og:video meta tag, pulls the player's
# `config=` querystring (a JSON-ish JS literal), and takes the media URL from
# the parsed playlist.
2208 class EscapistIE(InfoExtractor):
2209 """Information extractor for The Escapist """
2211 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2212 IE_NAME = u'escapist'
2214 def report_extraction(self, showName):
2215 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2217 def report_config_download(self, showName):
2218 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2220 def _real_extract(self, url):
2221 mobj = re.match(self._VALID_URL, url)
2223 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2225 showName = mobj.group('showname')
2226 videoId = mobj.group('episode')
2228 self.report_extraction(showName)
2230 webPage = urllib2.urlopen(url).read()
2231 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2232 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): these re.search results are used without a None check — a page
# missing any of the meta tags would raise AttributeError on .group(1).
2235 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2236 description = unescapeHTML(descMatch.group(1))
2237 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2238 imgUrl = unescapeHTML(imgMatch.group(1))
2239 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2240 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL is percent-encoded inside the player URL's querystring.
2241 configUrlMatch = re.search('config=(.*)$', playerUrl)
2242 configUrl = urllib2.unquote(configUrlMatch.group(1))
2244 self.report_config_download(showName)
2246 configJSON = urllib2.urlopen(configUrl).read()
2247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2248 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2251 # Technically, it's JavaScript, not JSON
# Crude single-quote -> double-quote fix-up so json.loads accepts the JS literal;
# would corrupt any value that itself contains quotes.
2252 configJSON = configJSON.replace("'", '"')
2255 config = json.loads(configJSON)
2256 except (ValueError,), err:
2257 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Second playlist entry holds the actual media URL (index 0 is presumably an ad
# or preroll — TODO confirm).
2260 playlist = config['playlist']
2261 videoUrl = playlist[1]['url']
2266 'uploader': showName,
2267 'upload_date': None,
2269 'stitle': simplify_title(showName),
2272 'thumbnail': imgUrl,
2273 'description': description,
2274 'player_url': playerUrl,
# NOTE(review): line-elided dump; guard/return lines are missing from view.
# Extractor for collegehumor.com: maps the public video id to an internal id via
# the page, then reads title/file/thumbnail from the moogaloop metadata XML.
2280 class CollegeHumorIE(InfoExtractor):
2281 """Information extractor for collegehumor.com"""
2283 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2284 IE_NAME = u'collegehumor'
2286 def report_webpage(self, video_id):
2287 """Report information extraction."""
2288 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2290 def report_extraction(self, video_id):
2291 """Report information extraction."""
2292 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2294 def _real_extract(self, url):
2295 mobj = re.match(self._VALID_URL, url)
2297 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2299 video_id = mobj.group('videoid')
2301 self.report_webpage(video_id)
2302 request = urllib2.Request(url)
2304 webpage = urllib2.urlopen(request).read()
2305 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2306 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an element id of the form id="video:<internal id>".
2309 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2311 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2313 internal_video_id = m.group('internalvideoid')
2317 'internal_id': internal_video_id,
2320 self.report_extraction(video_id)
2321 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2323 metaXml = urllib2.urlopen(xmlUrl).read()
2324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2325 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Pull the fields out of <video>; any missing element raises inside the elided
# try and is reported as invalid metadata below.
2328 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2330 videoNode = mdoc.findall('./video')[0]
2331 info['description'] = videoNode.findall('./description')[0].text
2332 info['title'] = videoNode.findall('./caption')[0].text
2333 info['stitle'] = simplify_title(info['title'])
2334 info['url'] = videoNode.findall('./file')[0].text
2335 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
2336 info['ext'] = info['url'].rpartition('.')[2]
2337 info['format'] = info['ext']
2339 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): line-elided dump; guard/return lines are missing from view.
# Extractor for xvideos.com: scrapes flv_url, <title>, and the thumbnail URL
# straight out of the watch page.
2345 class XVideosIE(InfoExtractor):
2346 """Information extractor for xvideos.com"""
2348 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2349 IE_NAME = u'xvideos'
2351 def report_webpage(self, video_id):
2352 """Report information extraction."""
2353 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2355 def report_extraction(self, video_id):
2356 """Report information extraction."""
2357 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2359 def _real_extract(self, url):
2360 mobj = re.match(self._VALID_URL, url)
2362 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2364 video_id = mobj.group(1).decode('utf-8')
2366 self.report_webpage(video_id)
# Always re-fetch the canonical URL form rather than the user-supplied one.
2368 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2370 webpage = urllib2.urlopen(request).read()
2371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2372 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2375 self.report_extraction(video_id)
# The media URL is percent-encoded in a 'flv_url=' querystring parameter.
2379 mobj = re.search(r'flv_url=(.+?)&', webpage)
2381 self._downloader.trouble(u'ERROR: unable to extract video url')
2383 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2387 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2389 self._downloader.trouble(u'ERROR: unable to extract video title')
2391 video_title = mobj.group(1).decode('utf-8')
2394 # Extract video thumbnail
2395 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2397 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2399 video_thumbnail = mobj.group(1).decode('utf-8')
2405 'upload_date': None,
2406 'title': video_title,
2407 'stitle': simplify_title(video_title),
2410 'thumbnail': video_thumbnail,
2411 'description': None,
# NOTE(review): line-elided dump; guard/return lines are missing from view.
# Extractor for soundcloud.com: scrapes the track page for the media uid and
# stream token, then builds the media.soundcloud.com stream URL from them.
2418 class SoundcloudIE(InfoExtractor):
2419 """Information extractor for soundcloud.com
2420 To access the media, the uid of the song and a stream token
2421 must be extracted from the page source and the script must make
2422 a request to media.soundcloud.com/crossdomain.xml. Then
2423 the media can be grabbed by requesting from an url composed
2424 of the stream token and uid
2427 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2428 IE_NAME = u'soundcloud'
2430 def __init__(self, downloader=None):
2431 InfoExtractor.__init__(self, downloader)
2433 def report_webpage(self, video_id):
2434 """Report information extraction."""
2435 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2437 def report_extraction(self, video_id):
2438 """Report information extraction."""
2439 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2441 def _real_extract(self, url):
2442 mobj = re.match(self._VALID_URL, url)
2444 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2447 # extract uploader (which is in the url)
2448 uploader = mobj.group(1).decode('utf-8')
2449 # extract simple title (uploader + slug of song title)
2450 slug_title = mobj.group(2).decode('utf-8')
2451 simple_title = uploader + '-' + slug_title
2453 self.report_webpage('%s/%s' % (uploader, slug_title))
2455 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2457 webpage = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2462 self.report_extraction('%s/%s' % (uploader, slug_title))
2464 # extract uid and stream token that soundcloud hands out for access
# NOTE(review): scraped from inline page JS/JSON — fragile against site changes,
# and the match is used without a visible None check.
2465 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2467 video_id = mobj.group(1)
2468 stream_token = mobj.group(2)
2470 # extract unsimplified title
2471 mobj = re.search('"title":"(.*?)",', webpage)
2473 title = mobj.group(1)
2475 # construct media url (with uid/token)
2476 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2477 mediaURL = mediaURL % (video_id, stream_token)
2480 description = u'No description available'
2481 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2483 description = mobj.group(1)
# Parse the pretty-printed date (e.g. "November 4, 2010 14:02") to YYYYMMDD;
# failures are swallowed on elided lines.
2487 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2490 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2491 except Exception, e:
2494 # for soundcloud, a request to a cross domain is required for cookies
2495 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# NOTE(review): 'title' holds the real track title but the info dict uses
# simple_title for both 'title' and 'stitle' — looks unintentional; confirm.
2498 'id': video_id.decode('utf-8'),
2500 'uploader': uploader.decode('utf-8'),
2501 'upload_date': upload_date,
2502 'title': simple_title.decode('utf-8'),
2503 'stitle': simple_title.decode('utf-8'),
2507 'description': description.decode('utf-8')
# NOTE(review): line-elided dump; guard/return lines (and apparently the IE_NAME
# assignment) are missing from view.
# Extractor for infoq.com: decodes the base64 'jsclassref' attribute into an
# RTMPE path and scrapes title/description from the page.
2511 class InfoQIE(InfoExtractor):
2512 """Information extractor for infoq.com"""
2514 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2517 def report_webpage(self, video_id):
2518 """Report information extraction."""
2519 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2521 def report_extraction(self, video_id):
2522 """Report information extraction."""
2523 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2525 def _real_extract(self, url):
2526 mobj = re.match(self._VALID_URL, url)
2528 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2531 self.report_webpage(url)
2533 request = urllib2.Request(url)
2535 webpage = urllib2.urlopen(request).read()
2536 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2537 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2540 self.report_extraction(url)
2544 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2546 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64; decoding it yields the percent-encoded RTMPE stream path.
2548 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2552 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2554 self._downloader.trouble(u'ERROR: unable to extract video title')
2556 video_title = mobj.group(1).decode('utf-8')
2558 # Extract description
2559 video_description = u'No description available.'
2560 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2561 if mobj is not None:
2562 video_description = mobj.group(1).decode('utf-8')
# Video id and extension come from the final path component ('<id>.<ext>').
2564 video_filename = video_url.split('/')[-1]
2565 video_id, extension = video_filename.split('.')
2571 'upload_date': None,
2572 'title': video_title,
2573 'stitle': simplify_title(video_title),
2575 'format': extension, # Extension is always(?) mp4, but seems to be flv
2577 'description': video_description,
# NOTE(review): line-elided dump; guard/return/continue lines are missing from view.
# Extractor for mixcloud.com: fetches the cloudcast JSON API, whose
# 'audio_formats' maps format -> (optionally bitrate ->) list of candidate URLs;
# picks the first URL that actually responds.
2583 class MixcloudIE(InfoExtractor):
2584 """Information extractor for www.mixcloud.com"""
2585 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2586 IE_NAME = u'mixcloud'
2588 def __init__(self, downloader=None):
2589 InfoExtractor.__init__(self, downloader)
2591 def report_download_json(self, file_id):
2592 """Report JSON download."""
2593 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2595 def report_extraction(self, file_id):
2596 """Report information extraction."""
2597 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2599 def get_urls(self, jsonData, fmt, bitrate='best'):
2600 """Get urls from 'audio_formats' section in json"""
2603 bitrate_list = jsonData[fmt]
# 'best' (or an unknown bitrate) falls back to the highest available one.
# NOTE(review): max() on string bitrate keys compares lexicographically —
# e.g. '96' > '128' — so "highest" may be wrong; confirm key types.
2604 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2605 bitrate = max(bitrate_list) # select highest
2607 url_list = jsonData[fmt][bitrate]
# Some entries have no bitrate level: jsonData[fmt] is already the URL list.
2608 except TypeError: # we have no bitrate info.
2609 url_list = jsonData[fmt]
2612 def check_urls(self, url_list):
2613 """Returns 1st active url from list"""
2614 for url in url_list:
2616 urllib2.urlopen(url)
2618 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# Print a "format<TAB>bitrate<TAB>[ext]" table for --list-formats.
2623 def _print_formats(self, formats):
2624 print 'Available formats:'
2625 for fmt in formats.keys():
2626 for b in formats[fmt]:
2628 ext = formats[fmt][b][0]
2629 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2630 except TypeError: # we have no bitrate info
2631 ext = formats[fmt][0]
2632 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2635 def _real_extract(self, url):
2636 mobj = re.match(self._VALID_URL, url)
2638 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2640 # extract uploader & filename from url
2641 uploader = mobj.group(1).decode('utf-8')
2642 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2644 # construct API request
2645 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2646 # retrieve .json file with links to files
2647 request = urllib2.Request(file_url)
2649 self.report_download_json(file_url)
2650 jsonData = urllib2.urlopen(request).read()
2651 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2652 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2656 json_data = json.loads(jsonData)
2657 player_url = json_data['player_swf_url']
2658 formats = dict(json_data['audio_formats'])
2660 req_format = self._downloader.params.get('format', None)
2663 if self._downloader.params.get('listformats', None):
2664 self._print_formats(formats)
# No explicit format: probe each format until one has a live URL.
2667 if req_format is None or req_format == 'best':
2668 for format_param in formats.keys():
2669 url_list = self.get_urls(formats, format_param)
2671 file_url = self.check_urls(url_list)
2672 if file_url is not None:
2675 if req_format not in formats.keys():
2676 self._downloader.trouble(u'ERROR: format is not available')
2679 url_list = self.get_urls(formats, req_format)
2680 file_url = self.check_urls(url_list)
2681 format_param = req_format
2684 'id': file_id.decode('utf-8'),
2685 'url': file_url.decode('utf-8'),
2686 'uploader': uploader.decode('utf-8'),
2687 'upload_date': u'NA',
2688 'title': json_data['name'],
2689 'stitle': simplify_title(json_data['name']),
2690 'ext': file_url.split('.')[-1].decode('utf-8'),
2691 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2692 'thumbnail': json_data['thumbnail_url'],
2693 'description': json_data['description'],
2694 'player_url': player_url.decode('utf-8'),
# NOTE(review): line-elided dump; guard/try/else lines are missing from view.
# Extractor for Stanford Open ClassRoom. Three cases by URL specificity:
# a single video (course+video), a course page (references to its VideoPages),
# or the site root (references to all CoursePages). The latter two return
# 'reference' entries that are recursively resolved via self.extract().
2697 class StanfordOpenClassroomIE(InfoExtractor):
2698 """Information extractor for Stanford's Open ClassRoom"""
2700 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2701 IE_NAME = u'stanfordoc'
2703 def report_download_webpage(self, objid):
2704 """Report information extraction."""
2705 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2707 def report_extraction(self, video_id):
2708 """Report information extraction."""
2709 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2711 def _real_extract(self, url):
2712 mobj = re.match(self._VALID_URL, url)
2714 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a fully-specified single video.
2717 if mobj.group('course') and mobj.group('video'): # A specific video
2718 course = mobj.group('course')
2719 video = mobj.group('video')
2721 'id': simplify_title(course + '_' + video),
2724 self.report_extraction(info['id'])
2725 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2726 xmlUrl = baseUrl + video + '.xml'
2728 metaXml = urllib2.urlopen(xmlUrl).read()
2729 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2730 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2732 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2734 info['title'] = mdoc.findall('./title')[0].text
# The XML's <videoFile> is relative to the course's videos/ directory.
2735 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2737 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2739 info['stitle'] = simplify_title(info['title'])
2740 info['ext'] = info['url'].rpartition('.')[2]
2741 info['format'] = info['ext']
# Case 2: a course page — collect its VideoPage links as references.
2743 elif mobj.group('course'): # A course page
2744 course = mobj.group('course')
2746 'id': simplify_title(course),
2750 self.report_download_webpage(info['id'])
2752 coursepage = urllib2.urlopen(url).read()
2753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2754 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2757 m = re.search('<h1>([^<]+)</h1>', coursepage)
2759 info['title'] = unescapeHTML(m.group(1))
2761 info['title'] = info['id']
2762 info['stitle'] = simplify_title(info['title'])
2764 m = re.search('<description>([^<]+)</description>', coursepage)
2766 info['description'] = unescapeHTML(m.group(1))
# orderedSet dedupes while preserving link order.
2768 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2771 'type': 'reference',
2772 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced VideoPage.
2776 for entry in info['list']:
2777 assert entry['type'] == 'reference'
2778 results += self.extract(entry['url'])
# Case 3: the site root — collect all CoursePage links as references.
2783 'id': 'Stanford OpenClassroom',
2787 self.report_download_webpage(info['id'])
2788 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2790 rootpage = urllib2.urlopen(rootURL).read()
2791 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2792 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2795 info['title'] = info['id']
2796 info['stitle'] = simplify_title(info['title'])
2798 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2801 'type': 'reference',
2802 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2807 for entry in info['list']:
2808 assert entry['type'] == 'reference'
2809 results += self.extract(entry['url'])
2812 class MTVIE(InfoExtractor):
2813 """Information extractor for MTV.com"""
2815 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2818 def report_webpage(self, video_id):
2819 """Report information extraction."""
2820 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2822 def report_extraction(self, video_id):
2823 """Report information extraction."""
2824 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2826 def _real_extract(self, url):
2827 mobj = re.match(self._VALID_URL, url)
2829 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2831 if not mobj.group('proto'):
2832 url = 'http://' + url
2833 video_id = mobj.group('videoid')
2834 self.report_webpage(video_id)
2836 request = urllib2.Request(url)
2838 webpage = urllib2.urlopen(request).read()
2839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2840 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2843 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2845 self._downloader.trouble(u'ERROR: unable to extract song name')
2847 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2848 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2850 self._downloader.trouble(u'ERROR: unable to extract performer')
2852 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2853 video_title = performer + ' - ' + song_name
2855 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2857 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2859 mtvn_uri = mobj.group(1)
2861 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2863 self._downloader.trouble(u'ERROR: unable to extract content id')
2865 content_id = mobj.group(1)
2867 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2868 self.report_extraction(video_id)
2869 request = urllib2.Request(videogen_url)
2871 metadataXml = urllib2.urlopen(request).read()
2872 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2873 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2876 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2877 renditions = mdoc.findall('.//rendition')
2879 # For now, always pick the highest quality.
2880 rendition = renditions[-1]
2883 _,_,ext = rendition.attrib['type'].partition('/')
2884 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2885 video_url = rendition.find('./src').text
2887 self._downloader.trouble('Invalid rendition field.')
2893 'uploader': performer,
2894 'title': video_title,
2895 'stitle': simplify_title(video_title),