Every extractor also returns its name.
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
17
18 try:
19         import cStringIO as StringIO
20 except ImportError:
21         import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and pulls out everything needed to
    download the video (or videos) it points to: the real media URL, the
    title, the uploader, and so on. The result is a dictionary that gets
    handed to the FileDownloader, which may then download the file or do
    something else with the information. Every such dictionary must carry:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    Two more fields are optional; they mainly exist so youtube-dl can back a
    video search frontend (such as the one in youtube2mp3) and are consulted
    only by the corresponding forced-printing options:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses should override _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; normally they are also registered in the
    list of extractors.
    """

    # One-time-initialization flag and the FileDownloader this IE reports to.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        matched = re.match(self._VALID_URL, url)
        return matched is not None

    def initialize(self):
        """Initializes an instance (authentication, etc) at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
93
94
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Format itags, listed in order of quality (best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same, but free (WebM) formats are preferred over equivalent MP4 ones
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (anything not listed falls back to 'flv')
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" display string, used only by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) format."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without an explicit duration show for 4 seconds
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _print_formats(self, formats):
        """Print itag, container and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language to English and, if credentials are
        available, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
                'next_url':             '/',
                'action_confirm':       'Confirm',
                }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract video information for a YouTube URL.

        Returns a list of info dicts (one per selected format), or None
        after signalling trouble to the downloader.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page's JS config
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' contexts, stopping at the first
        # response that carries a 'token' parameter
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try the known date layouts
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    # Parsed successfully; stop before re-feeding the result to strptime
                    break
                except ValueError:
                    # This layout didn't match; try the next one
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                # Map language code -> track name for every available track
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                # Language preference: explicit option, then English, then whatever is first
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = srt_lang_list.keys()[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # Subtitles are best-effort: report the warning and carry on
                self._downloader.trouble(trouble[0])

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> URL map from the comma-separated stream map
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                # Was bare IE_NAME, which is a NameError at runtime: class
                # attributes are not in method scope in Python.
                'provider': self.IE_NAME,
                'id':           video_id.decode('utf-8'),
                'url':          video_real_url.decode('utf-8'),
                'uploader':     video_uploader.decode('utf-8'),
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension.decode('utf-8'),
                'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail':    video_thumbnail.decode('utf-8'),
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles
            })
        return results
462
463
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and disable the filter so
        age-restricted videos become accessible."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract video information for a Metacafe URL.

        Returns a single-element list of info dicts, or None after
        signalling trouble (or after delegating a YouTube-hosted video).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, hand it to the
        # downloader so the YouTube extractor processes it instead
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            # Message fixed: was "unable retrieve video webpage"
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages carry the media data inside the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            # Was bare IE_NAME, which is a NameError at runtime: class
            # attributes are not in method scope in Python.
            'provider': self.IE_NAME,
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  u'NA',
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'format':       u'NA',
            'player_url':   None,
        }]
592
593
594 class DailymotionIE(InfoExtractor):
595         """Information Extractor for Dailymotion"""
596
597         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
598         IE_NAME = u'dailymotion'
599
600         def __init__(self, downloader=None):
601                 InfoExtractor.__init__(self, downloader)
602
603         def report_download_webpage(self, video_id):
604                 """Report webpage download."""
605                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
606
607         def report_extraction(self, video_id):
608                 """Report information extraction."""
609                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
610
611         def _real_extract(self, url):
612                 # Extract id and simplified title from URL
613                 mobj = re.match(self._VALID_URL, url)
614                 if mobj is None:
615                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
616                         return
617
618                 video_id = mobj.group(1)
619
620                 video_extension = 'flv'
621
622                 # Retrieve video webpage to extract further information
623                 request = urllib2.Request(url)
624                 request.add_header('Cookie', 'family_filter=off')
625                 try:
626                         self.report_download_webpage(video_id)
627                         webpage = urllib2.urlopen(request).read()
628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
629                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
630                         return
631
632                 # Extract URL, uploader and title from webpage
633                 self.report_extraction(video_id)
634                 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
635                 if mobj is None:
636                         self._downloader.trouble(u'ERROR: unable to extract media URL')
637                         return
638                 sequence = urllib.unquote(mobj.group(1))
639                 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
640                 if mobj is None:
641                         self._downloader.trouble(u'ERROR: unable to extract media URL')
642                         return
643                 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
644
645                 # if needed add http://www.dailymotion.com/ if relative URL
646
647                 video_url = mediaURL
648
649                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
650                 if mobj is None:
651                         self._downloader.trouble(u'ERROR: unable to extract title')
652                         return
653                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
654
655                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
656                 if mobj is None:
657                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
658                         return
659                 video_uploader = mobj.group(1)
660
661                 return [{
662                         'provider': IE_NAME,
663                         'id':           video_id.decode('utf-8'),
664                         'url':          video_url.decode('utf-8'),
665                         'uploader':     video_uploader.decode('utf-8'),
666                         'upload_date':  u'NA',
667                         'title':        video_title,
668                         'ext':          video_extension.decode('utf-8'),
669                         'format':       u'NA',
670                         'player_url':   None,
671                 }]
672
673
674 class GoogleIE(InfoExtractor):
675         """Information extractor for video.google.com."""
676
677         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
678         IE_NAME = u'video.google'
679
680         def __init__(self, downloader=None):
681                 InfoExtractor.__init__(self, downloader)
682
683         def report_download_webpage(self, video_id):
684                 """Report webpage download."""
685                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
686
687         def report_extraction(self, video_id):
688                 """Report information extraction."""
689                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
690
691         def _real_extract(self, url):
692                 # Extract id from URL
693                 mobj = re.match(self._VALID_URL, url)
694                 if mobj is None:
695                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
696                         return
697
698                 video_id = mobj.group(1)
699
700                 video_extension = 'mp4'
701
702                 # Retrieve video webpage to extract further information
703                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
704                 try:
705                         self.report_download_webpage(video_id)
706                         webpage = urllib2.urlopen(request).read()
707                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
708                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
709                         return
710
711                 # Extract URL, uploader, and title from webpage
712                 self.report_extraction(video_id)
713                 mobj = re.search(r"download_url:'([^']+)'", webpage)
714                 if mobj is None:
715                         video_extension = 'flv'
716                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
717                 if mobj is None:
718                         self._downloader.trouble(u'ERROR: unable to extract media URL')
719                         return
720                 mediaURL = urllib.unquote(mobj.group(1))
721                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
722                 mediaURL = mediaURL.replace('\\x26', '\x26')
723
724                 video_url = mediaURL
725
726                 mobj = re.search(r'<title>(.*)</title>', webpage)
727                 if mobj is None:
728                         self._downloader.trouble(u'ERROR: unable to extract title')
729                         return
730                 video_title = mobj.group(1).decode('utf-8')
731
732                 # Extract video description
733                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
734                 if mobj is None:
735                         self._downloader.trouble(u'ERROR: unable to extract video description')
736                         return
737                 video_description = mobj.group(1).decode('utf-8')
738                 if not video_description:
739                         video_description = 'No description available.'
740
741                 # Extract video thumbnail
742                 if self._downloader.params.get('forcethumbnail', False):
743                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
744                         try:
745                                 webpage = urllib2.urlopen(request).read()
746                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
747                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
748                                 return
749                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
750                         if mobj is None:
751                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
752                                 return
753                         video_thumbnail = mobj.group(1)
754                 else:   # we need something to pass to process_info
755                         video_thumbnail = ''
756
757                 return [{
758                         'provider': IE_NAME,
759                         'id':           video_id.decode('utf-8'),
760                         'url':          video_url.decode('utf-8'),
761                         'uploader':     u'NA',
762                         'upload_date':  u'NA',
763                         'title':        video_title,
764                         'ext':          video_extension.decode('utf-8'),
765                         'format':       u'NA',
766                         'player_url':   None,
767                 }]
768
769
770 class PhotobucketIE(InfoExtractor):
771         """Information extractor for photobucket.com."""
772
773         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
774         IE_NAME = u'photobucket'
775
776         def __init__(self, downloader=None):
777                 InfoExtractor.__init__(self, downloader)
778
779         def report_download_webpage(self, video_id):
780                 """Report webpage download."""
781                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
782
783         def report_extraction(self, video_id):
784                 """Report information extraction."""
785                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
786
787         def _real_extract(self, url):
788                 # Extract id from URL
789                 mobj = re.match(self._VALID_URL, url)
790                 if mobj is None:
791                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
792                         return
793
794                 video_id = mobj.group(1)
795
796                 video_extension = 'flv'
797
798                 # Retrieve video webpage to extract further information
799                 request = urllib2.Request(url)
800                 try:
801                         self.report_download_webpage(video_id)
802                         webpage = urllib2.urlopen(request).read()
803                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
804                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
805                         return
806
807                 # Extract URL, uploader, and title from webpage
808                 self.report_extraction(video_id)
809                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
810                 if mobj is None:
811                         self._downloader.trouble(u'ERROR: unable to extract media URL')
812                         return
813                 mediaURL = urllib.unquote(mobj.group(1))
814
815                 video_url = mediaURL
816
817                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
818                 if mobj is None:
819                         self._downloader.trouble(u'ERROR: unable to extract title')
820                         return
821                 video_title = mobj.group(1).decode('utf-8')
822
823                 video_uploader = mobj.group(2).decode('utf-8')
824
825                 return [{
826                         'provider': IE_NAME,
827                         'id':           video_id.decode('utf-8'),
828                         'url':          video_url.decode('utf-8'),
829                         'uploader':     video_uploader,
830                         'upload_date':  u'NA',
831                         'title':        video_title,
832                         'ext':          video_extension.decode('utf-8'),
833                         'format':       u'NA',
834                         'player_url':   None,
835                 }]
836
837
838 class YahooIE(InfoExtractor):
839         """Information extractor for video.yahoo.com."""
840
841         # _VALID_URL matches all Yahoo! Video URLs
842         # _VPAGE_URL matches only the extractable '/watch/' URLs
843         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
844         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
845         IE_NAME = u'video.yahoo'
846
847         def __init__(self, downloader=None):
848                 InfoExtractor.__init__(self, downloader)
849
850         def report_download_webpage(self, video_id):
851                 """Report webpage download."""
852                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
853
854         def report_extraction(self, video_id):
855                 """Report information extraction."""
856                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
857
858         def _real_extract(self, url, new_video=True):
859                 # Extract ID from URL
860                 mobj = re.match(self._VALID_URL, url)
861                 if mobj is None:
862                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
863                         return
864
865                 video_id = mobj.group(2)
866                 video_extension = 'flv'
867
868                 # Rewrite valid but non-extractable URLs as
869                 # extractable English language /watch/ URLs
870                 if re.match(self._VPAGE_URL, url) is None:
871                         request = urllib2.Request(url)
872                         try:
873                                 webpage = urllib2.urlopen(request).read()
874                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
875                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
876                                 return
877
878                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
879                         if mobj is None:
880                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
881                                 return
882                         yahoo_id = mobj.group(1)
883
884                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
885                         if mobj is None:
886                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
887                                 return
888                         yahoo_vid = mobj.group(1)
889
890                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
891                         return self._real_extract(url, new_video=False)
892
893                 # Retrieve video webpage to extract further information
894                 request = urllib2.Request(url)
895                 try:
896                         self.report_download_webpage(video_id)
897                         webpage = urllib2.urlopen(request).read()
898                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
899                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
900                         return
901
902                 # Extract uploader and title from webpage
903                 self.report_extraction(video_id)
904                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
905                 if mobj is None:
906                         self._downloader.trouble(u'ERROR: unable to extract video title')
907                         return
908                 video_title = mobj.group(1).decode('utf-8')
909
910                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
911                 if mobj is None:
912                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
913                         return
914                 video_uploader = mobj.group(1).decode('utf-8')
915
916                 # Extract video thumbnail
917                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
918                 if mobj is None:
919                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
920                         return
921                 video_thumbnail = mobj.group(1).decode('utf-8')
922
923                 # Extract video description
924                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
925                 if mobj is None:
926                         self._downloader.trouble(u'ERROR: unable to extract video description')
927                         return
928                 video_description = mobj.group(1).decode('utf-8')
929                 if not video_description:
930                         video_description = 'No description available.'
931
932                 # Extract video height and width
933                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
934                 if mobj is None:
935                         self._downloader.trouble(u'ERROR: unable to extract video height')
936                         return
937                 yv_video_height = mobj.group(1)
938
939                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
940                 if mobj is None:
941                         self._downloader.trouble(u'ERROR: unable to extract video width')
942                         return
943                 yv_video_width = mobj.group(1)
944
945                 # Retrieve video playlist to extract media URL
946                 # I'm not completely sure what all these options are, but we
947                 # seem to need most of them, otherwise the server sends a 401.
948                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
949                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
950                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
951                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
952                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
953                 try:
954                         self.report_download_webpage(video_id)
955                         webpage = urllib2.urlopen(request).read()
956                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
957                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
958                         return
959
960                 # Extract media URL from playlist XML
961                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
962                 if mobj is None:
963                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
964                         return
965                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
966                 video_url = unescapeHTML(video_url)
967
968                 return [{
969                         'provider': IE_NAME,
970                         'id':           video_id.decode('utf-8'),
971                         'url':          video_url,
972                         'uploader':     video_uploader,
973                         'upload_date':  u'NA',
974                         'title':        video_title,
975                         'ext':          video_extension.decode('utf-8'),
976                         'thumbnail':    video_thumbnail.decode('utf-8'),
977                         'description':  video_description,
978                         'thumbnail':    video_thumbnail,
979                         'player_url':   None,
980                 }]
981
982
983 class VimeoIE(InfoExtractor):
984         """Information extractor for vimeo.com."""
985
986         # _VALID_URL matches Vimeo URLs
987         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
988         IE_NAME = u'vimeo'
989
990         def __init__(self, downloader=None):
991                 InfoExtractor.__init__(self, downloader)
992
993         def report_download_webpage(self, video_id):
994                 """Report webpage download."""
995                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
996
997         def report_extraction(self, video_id):
998                 """Report information extraction."""
999                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1000
1001         def _real_extract(self, url, new_video=True):
1002                 # Extract ID from URL
1003                 mobj = re.match(self._VALID_URL, url)
1004                 if mobj is None:
1005                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1006                         return
1007
1008                 video_id = mobj.group(1)
1009
1010                 # Retrieve video webpage to extract further information
1011                 request = urllib2.Request(url, None, std_headers)
1012                 try:
1013                         self.report_download_webpage(video_id)
1014                         webpage = urllib2.urlopen(request).read()
1015                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1016                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1017                         return
1018
1019                 # Now we begin extracting as much information as we can from what we
1020                 # retrieved. First we extract the information common to all extractors,
1021                 # and latter we extract those that are Vimeo specific.
1022                 self.report_extraction(video_id)
1023
1024                 # Extract the config JSON
1025                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1026                 try:
1027                         config = json.loads(config)
1028                 except:
1029                         self._downloader.trouble(u'ERROR: unable to extract info section')
1030                         return
1031                 
1032                 # Extract title
1033                 video_title = config["video"]["title"]
1034
1035                 # Extract uploader
1036                 video_uploader = config["video"]["owner"]["name"]
1037
1038                 # Extract video thumbnail
1039                 video_thumbnail = config["video"]["thumbnail"]
1040
1041                 # Extract video description
1042                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1043                 if video_description: video_description = clean_html(video_description)
1044                 else: video_description = ''
1045
1046                 # Extract upload date
1047                 video_upload_date = u'NA'
1048                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1049                 if mobj is not None:
1050                         video_upload_date = mobj.group(1)
1051
1052                 # Vimeo specific: extract request signature and timestamp
1053                 sig = config['request']['signature']
1054                 timestamp = config['request']['timestamp']
1055
1056                 # Vimeo specific: extract video codec and quality information
1057                 # TODO bind to format param
1058                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1059                 for codec in codecs:
1060                         if codec[0] in config["video"]["files"]:
1061                                 video_codec = codec[0]
1062                                 video_extension = codec[1]
1063                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1064                                 else: quality = 'sd'
1065                                 break
1066                 else:
1067                         self._downloader.trouble(u'ERROR: no known codec found')
1068                         return
1069
1070                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1071                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1072
1073                 return [{
1074                         'provider': IE_NAME,
1075                         'id':           video_id,
1076                         'url':          video_url,
1077                         'uploader':     video_uploader,
1078                         'upload_date':  video_upload_date,
1079                         'title':        video_title,
1080                         'ext':          video_extension,
1081                         'thumbnail':    video_thumbnail,
1082                         'description':  video_description,
1083                         'player_url':   None,
1084                 }]
1085
1086
1087 class GenericIE(InfoExtractor):
1088         """Generic last-resort information extractor."""
1089
1090         _VALID_URL = r'.*'
1091         IE_NAME = u'generic'
1092
	def __init__(self, downloader=None):
		"""Initialize via the InfoExtractor base class."""
		InfoExtractor.__init__(self, downloader)
1095
	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		# Warn first: this extractor is a last resort, so reaching it
		# means no site-specific extractor matched the URL.
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1100
	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1104
	def report_following_redirect(self, new_url):
		"""Report that a redirect (e.g. a URL shortener) is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1108                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain."""
		class HeadRequest(urllib2.Request):
			# Request subclass that issues HEAD instead of GET, so the
			# redirect chain can be resolved without downloading bodies.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our 
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl): 
				if code in (301, 302, 303, 307):
					newurl = newurl.replace(' ', '%20') 
					# Drop body-related headers: the follow-up HEAD
					# request carries no body.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl, 
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(), 
									   unverifiable=True) 
				else: 
					# Any other redirect code is treated as an error.
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers): 
				# Drain and close the failed response before retrying.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				return self.parent.open(urllib2.Request(req.get_full_url(), 
												 headers=newheaders, 
												 origin_req_host=req.get_origin_req_host(), 
												 unverifiable=True))

		# Build our opener
		opener = urllib2.OpenerDirector() 
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		# geturl() reflects the final URL after any redirects.
		response = opener.open(HeadRequest(url))
		new_url = response.geturl()
		
		# Not a redirect: let the caller continue its own extraction.
		if url == new_url: return False
		
		# Redirect found: restart the whole download chain on the
		# resolved URL and tell the caller to stop.
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1162
1163         def _real_extract(self, url):
1164                 if self._test_redirect(url): return
1165
1166                 video_id = url.split('/')[-1]
1167                 request = urllib2.Request(url)
1168                 try:
1169                         self.report_download_webpage(video_id)
1170                         webpage = urllib2.urlopen(request).read()
1171                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1172                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1173                         return
1174                 except ValueError, err:
1175                         # since this is the last-resort InfoExtractor, if
1176                         # this error is thrown, it'll be thrown here
1177                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1178                         return
1179
1180                 self.report_extraction(video_id)
1181                 # Start with something easy: JW Player in SWFObject
1182                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1183                 if mobj is None:
1184                         # Broaden the search a little bit
1185                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1186                 if mobj is None:
1187                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1188                         return
1189
1190                 # It's possible that one of the regexes
1191                 # matched, but returned an empty group:
1192                 if mobj.group(1) is None:
1193                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1194                         return
1195
1196                 video_url = urllib.unquote(mobj.group(1))
1197                 video_id = os.path.basename(video_url)
1198
1199                 # here's a fun little line of code for you:
1200                 video_extension = os.path.splitext(video_id)[1][1:]
1201                 video_id = os.path.splitext(video_id)[0]
1202
1203                 # it's tempting to parse this further, but you would
1204                 # have to take into account all the variations like
1205                 #   Video Title - Site Name
1206                 #   Site Name | Video Title
1207                 #   Video Title - Tagline | Site Name
1208                 # and so on and so forth; it's just not practical
1209                 mobj = re.search(r'<title>(.*)</title>', webpage)
1210                 if mobj is None:
1211                         self._downloader.trouble(u'ERROR: unable to extract title')
1212                         return
1213                 video_title = mobj.group(1).decode('utf-8')
1214
1215                 # video uploader is domain name
1216                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1217                 if mobj is None:
1218                         self._downloader.trouble(u'ERROR: unable to extract title')
1219                         return
1220                 video_uploader = mobj.group(1).decode('utf-8')
1221
1222                 return [{
1223                         'provider': IE_NAME,
1224                         'id':           video_id.decode('utf-8'),
1225                         'url':          video_url.decode('utf-8'),
1226                         'uploader':     video_uploader,
1227                         'upload_date':  u'NA',
1228                         'title':        video_title,
1229                         'ext':          video_extension.decode('utf-8'),
1230                         'format':       u'NA',
1231                         'player_url':   None,
1232                 }]
1233
1234
1235 class YoutubeSearchIE(InfoExtractor):
1236         """Information Extractor for YouTube search queries."""
1237         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1238         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1239         _max_youtube_results = 1000
1240         IE_NAME = u'youtube:search'
1241
1242         def __init__(self, downloader=None):
1243                 InfoExtractor.__init__(self, downloader)
1244
1245         def report_download_page(self, query, pagenum):
1246                 """Report attempt to download search page with given number."""
1247                 query = query.decode(preferredencoding())
1248                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1249
1250         def _real_extract(self, query):
1251                 mobj = re.match(self._VALID_URL, query)
1252                 if mobj is None:
1253                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1254                         return
1255
1256                 prefix, query = query.split(':')
1257                 prefix = prefix[8:]
1258                 query = query.encode('utf-8')
1259                 if prefix == '':
1260                         self._download_n_results(query, 1)
1261                         return
1262                 elif prefix == 'all':
1263                         self._download_n_results(query, self._max_youtube_results)
1264                         return
1265                 else:
1266                         try:
1267                                 n = long(prefix)
1268                                 if n <= 0:
1269                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1270                                         return
1271                                 elif n > self._max_youtube_results:
1272                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1273                                         n = self._max_youtube_results
1274                                 self._download_n_results(query, n)
1275                                 return
1276                         except ValueError: # parsing prefix as integer fails
1277                                 self._download_n_results(query, 1)
1278                                 return
1279
1280         def _download_n_results(self, query, n):
1281                 """Downloads a specified number of results for a query"""
1282
1283                 video_ids = []
1284                 pagenum = 0
1285                 limit = n
1286
1287                 while (50 * pagenum) < limit:
1288                         self.report_download_page(query, pagenum+1)
1289                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1290                         request = urllib2.Request(result_url)
1291                         try:
1292                                 data = urllib2.urlopen(request).read()
1293                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1294                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1295                                 return
1296                         api_response = json.loads(data)['data']
1297
1298                         new_ids = list(video['id'] for video in api_response['items'])
1299                         video_ids += new_ids
1300
1301                         limit = min(n, api_response['totalItems'])
1302                         pagenum += 1
1303
1304                 if len(video_ids) > n:
1305                         video_ids = video_ids[:n]
1306                 for id in video_ids:
1307                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1308                 return
1309
1310
1311 class GoogleSearchIE(InfoExtractor):
1312         """Information Extractor for Google Video search queries."""
1313         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1314         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1315         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1316         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1317         _max_google_results = 1000
1318         IE_NAME = u'video.google:search'
1319
1320         def __init__(self, downloader=None):
1321                 InfoExtractor.__init__(self, downloader)
1322
1323         def report_download_page(self, query, pagenum):
1324                 """Report attempt to download playlist page with given number."""
1325                 query = query.decode(preferredencoding())
1326                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1327
1328         def _real_extract(self, query):
1329                 mobj = re.match(self._VALID_URL, query)
1330                 if mobj is None:
1331                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1332                         return
1333
1334                 prefix, query = query.split(':')
1335                 prefix = prefix[8:]
1336                 query = query.encode('utf-8')
1337                 if prefix == '':
1338                         self._download_n_results(query, 1)
1339                         return
1340                 elif prefix == 'all':
1341                         self._download_n_results(query, self._max_google_results)
1342                         return
1343                 else:
1344                         try:
1345                                 n = long(prefix)
1346                                 if n <= 0:
1347                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1348                                         return
1349                                 elif n > self._max_google_results:
1350                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1351                                         n = self._max_google_results
1352                                 self._download_n_results(query, n)
1353                                 return
1354                         except ValueError: # parsing prefix as integer fails
1355                                 self._download_n_results(query, 1)
1356                                 return
1357
1358         def _download_n_results(self, query, n):
1359                 """Downloads a specified number of results for a query"""
1360
1361                 video_ids = []
1362                 pagenum = 0
1363
1364                 while True:
1365                         self.report_download_page(query, pagenum)
1366                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1367                         request = urllib2.Request(result_url)
1368                         try:
1369                                 page = urllib2.urlopen(request).read()
1370                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1371                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1372                                 return
1373
1374                         # Extract video identifiers
1375                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1376                                 video_id = mobj.group(1)
1377                                 if video_id not in video_ids:
1378                                         video_ids.append(video_id)
1379                                         if len(video_ids) == n:
1380                                                 # Specified n videos reached
1381                                                 for id in video_ids:
1382                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1383                                                 return
1384
1385                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1386                                 for id in video_ids:
1387                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1388                                 return
1389
1390                         pagenum = pagenum + 1
1391
1392
1393 class YahooSearchIE(InfoExtractor):
1394         """Information Extractor for Yahoo! Video search queries."""
1395         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1396         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1397         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1398         _MORE_PAGES_INDICATOR = r'\s*Next'
1399         _max_yahoo_results = 1000
1400         IE_NAME = u'video.yahoo:search'
1401
1402         def __init__(self, downloader=None):
1403                 InfoExtractor.__init__(self, downloader)
1404
1405         def report_download_page(self, query, pagenum):
1406                 """Report attempt to download playlist page with given number."""
1407                 query = query.decode(preferredencoding())
1408                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1409
1410         def _real_extract(self, query):
1411                 mobj = re.match(self._VALID_URL, query)
1412                 if mobj is None:
1413                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1414                         return
1415
1416                 prefix, query = query.split(':')
1417                 prefix = prefix[8:]
1418                 query = query.encode('utf-8')
1419                 if prefix == '':
1420                         self._download_n_results(query, 1)
1421                         return
1422                 elif prefix == 'all':
1423                         self._download_n_results(query, self._max_yahoo_results)
1424                         return
1425                 else:
1426                         try:
1427                                 n = long(prefix)
1428                                 if n <= 0:
1429                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1430                                         return
1431                                 elif n > self._max_yahoo_results:
1432                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1433                                         n = self._max_yahoo_results
1434                                 self._download_n_results(query, n)
1435                                 return
1436                         except ValueError: # parsing prefix as integer fails
1437                                 self._download_n_results(query, 1)
1438                                 return
1439
1440         def _download_n_results(self, query, n):
1441                 """Downloads a specified number of results for a query"""
1442
1443                 video_ids = []
1444                 already_seen = set()
1445                 pagenum = 1
1446
1447                 while True:
1448                         self.report_download_page(query, pagenum)
1449                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1450                         request = urllib2.Request(result_url)
1451                         try:
1452                                 page = urllib2.urlopen(request).read()
1453                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1454                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1455                                 return
1456
1457                         # Extract video identifiers
1458                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1459                                 video_id = mobj.group(1)
1460                                 if video_id not in already_seen:
1461                                         video_ids.append(video_id)
1462                                         already_seen.add(video_id)
1463                                         if len(video_ids) == n:
1464                                                 # Specified n videos reached
1465                                                 for id in video_ids:
1466                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1467                                                 return
1468
1469                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1470                                 for id in video_ids:
1471                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1472                                 return
1473
1474                         pagenum = pagenum + 1
1475
1476
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Does not return info dictionaries itself; it collects the video ids of
	the playlist and queues each watch URL with the downloader.
	"""

	# Group 1: the playlist type parameter ('p', 'a' or 'list'); group 2: the
	# playlist id (a leading 'PL' is stripped); group 3: a trailing video id,
	# present when the URL addresses a single entry inside the playlist.
	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	# Filled with (access page, prefix param, playlist id, page number).
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
	# Present in the HTML whenever a "next page" pager button exists.
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect every video id in the playlist and queue it for download."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			# NOTE(review): every non-'a' prefix (including 'list') is
			# rewritten to 'p'/'view_play_list' here — confirm intended.
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)
		video_ids = []
		pagenum = 1

		while True:
			self.report_download_page(playlist_id, pagenum)
			# Rebinds the 'url' parameter to the current page URL.
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			request = urllib2.Request(url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers (deduplicated within each page,
			# kept in page order; duplicates across pages are possible)
			ids_in_page = []
			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop once the pager offers no further page.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum = pagenum + 1

		# Honour playliststart/playlistend (1-based; -1 means "to the end").
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
		return
1548
1549
1550 class YoutubeUserIE(InfoExtractor):
1551         """Information Extractor for YouTube users."""
1552
1553         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1554         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1555         _GDATA_PAGE_SIZE = 50
1556         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1557         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1558         IE_NAME = u'youtube:user'
1559
1560         def __init__(self, downloader=None):
1561                 InfoExtractor.__init__(self, downloader)
1562
1563         def report_download_page(self, username, start_index):
1564                 """Report attempt to download user page."""
1565                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1566                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1567
1568         def _real_extract(self, url):
1569                 # Extract username
1570                 mobj = re.match(self._VALID_URL, url)
1571                 if mobj is None:
1572                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1573                         return
1574
1575                 username = mobj.group(1)
1576
1577                 # Download video ids using YouTube Data API. Result size per
1578                 # query is limited (currently to 50 videos) so we need to query
1579                 # page by page until there are no video ids - it means we got
1580                 # all of them.
1581
1582                 video_ids = []
1583                 pagenum = 0
1584
1585                 while True:
1586                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1587                         self.report_download_page(username, start_index)
1588
1589                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1590
1591                         try:
1592                                 page = urllib2.urlopen(request).read()
1593                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1594                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1595                                 return
1596
1597                         # Extract video identifiers
1598                         ids_in_page = []
1599
1600                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1601                                 if mobj.group(1) not in ids_in_page:
1602                                         ids_in_page.append(mobj.group(1))
1603
1604                         video_ids.extend(ids_in_page)
1605
1606                         # A little optimization - if current page is not
1607                         # "full", ie. does not contain PAGE_SIZE video ids then
1608                         # we can assume that this page is the last one - there
1609                         # are no more ids on further pages - no need to query
1610                         # again.
1611
1612                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1613                                 break
1614
1615                         pagenum += 1
1616
1617                 all_ids_count = len(video_ids)
1618                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1619                 playlistend = self._downloader.params.get('playlistend', -1)
1620
1621                 if playlistend == -1:
1622                         video_ids = video_ids[playliststart:]
1623                 else:
1624                         video_ids = video_ids[playliststart:playlistend]
1625
1626                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1627                                 (username, all_ids_count, len(video_ids)))
1628
1629                 for video_id in video_ids:
1630                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1631
1632
1633 class BlipTVUserIE(InfoExtractor):
1634         """Information Extractor for blip.tv users."""
1635
1636         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1637         _PAGE_SIZE = 12
1638         IE_NAME = u'blip.tv:user'
1639
1640         def __init__(self, downloader=None):
1641                 InfoExtractor.__init__(self, downloader)
1642
1643         def report_download_page(self, username, pagenum):
1644                 """Report attempt to download user page."""
1645                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1646                                 (self.IE_NAME, username, pagenum))
1647
1648         def _real_extract(self, url):
1649                 # Extract username
1650                 mobj = re.match(self._VALID_URL, url)
1651                 if mobj is None:
1652                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1653                         return
1654
1655                 username = mobj.group(1)
1656
1657                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1658
1659                 request = urllib2.Request(url)
1660
1661                 try:
1662                         page = urllib2.urlopen(request).read().decode('utf-8')
1663                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1664                         page_base = page_base % mobj.group(1)
1665                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1666                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1667                         return
1668
1669
1670                 # Download video ids using BlipTV Ajax calls. Result size per
1671                 # query is limited (currently to 12 videos) so we need to query
1672                 # page by page until there are no video ids - it means we got
1673                 # all of them.
1674
1675                 video_ids = []
1676                 pagenum = 1
1677
1678                 while True:
1679                         self.report_download_page(username, pagenum)
1680
1681                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1682
1683                         try:
1684                                 page = urllib2.urlopen(request).read().decode('utf-8')
1685                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1687                                 return
1688
1689                         # Extract video identifiers
1690                         ids_in_page = []
1691
1692                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1693                                 if mobj.group(1) not in ids_in_page:
1694                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1695
1696                         video_ids.extend(ids_in_page)
1697
1698                         # A little optimization - if current page is not
1699                         # "full", ie. does not contain PAGE_SIZE video ids then
1700                         # we can assume that this page is the last one - there
1701                         # are no more ids on further pages - no need to query
1702                         # again.
1703
1704                         if len(ids_in_page) < self._PAGE_SIZE:
1705                                 break
1706
1707                         pagenum += 1
1708
1709                 all_ids_count = len(video_ids)
1710                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1711                 playlistend = self._downloader.params.get('playlistend', -1)
1712
1713                 if playlistend == -1:
1714                         video_ids = video_ids[playliststart:]
1715                 else:
1716                         video_ids = video_ids[playliststart:playlistend]
1717
1718                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1719                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1720
1721                 for video_id in video_ids:
1722                         self._downloader.download([u'http://blip.tv/'+video_id])
1723
1724
1725 class DepositFilesIE(InfoExtractor):
1726         """Information extractor for depositfiles.com"""
1727
1728         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1729         IE_NAME = u'DepositFiles'
1730
1731         def __init__(self, downloader=None):
1732                 InfoExtractor.__init__(self, downloader)
1733
1734         def report_download_webpage(self, file_id):
1735                 """Report webpage download."""
1736                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1737
1738         def report_extraction(self, file_id):
1739                 """Report information extraction."""
1740                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1741
1742         def _real_extract(self, url):
1743                 file_id = url.split('/')[-1]
1744                 # Rebuild url in english locale
1745                 url = 'http://depositfiles.com/en/files/' + file_id
1746
1747                 # Retrieve file webpage with 'Free download' button pressed
1748                 free_download_indication = { 'gateway_result' : '1' }
1749                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1750                 try:
1751                         self.report_download_webpage(file_id)
1752                         webpage = urllib2.urlopen(request).read()
1753                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1754                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1755                         return
1756
1757                 # Search for the real file URL
1758                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1759                 if (mobj is None) or (mobj.group(1) is None):
1760                         # Try to figure out reason of the error.
1761                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1762                         if (mobj is not None) and (mobj.group(1) is not None):
1763                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1764                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1765                         else:
1766                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1767                         return
1768
1769                 file_url = mobj.group(1)
1770                 file_extension = os.path.splitext(file_url)[1][1:]
1771
1772                 # Search for file title
1773                 mobj = re.search(r'<b title="(.*?)">', webpage)
1774                 if mobj is None:
1775                         self._downloader.trouble(u'ERROR: unable to extract title')
1776                         return
1777                 file_title = mobj.group(1).decode('utf-8')
1778
1779                 return [{
1780                         'provider': IE_NAME,
1781                         'id':           file_id.decode('utf-8'),
1782                         'url':          file_url.decode('utf-8'),
1783                         'uploader':     u'NA',
1784                         'upload_date':  u'NA',
1785                         'title':        file_title,
1786                         'ext':          file_extension.decode('utf-8'),
1787                         'format':       u'NA',
1788                         'player_url':   None,
1789                 }]
1790
1791
1792 class FacebookIE(InfoExtractor):
1793         """Information Extractor for Facebook"""
1794
1795         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1796         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1797         _NETRC_MACHINE = 'facebook'
1798         _available_formats = ['video', 'highqual', 'lowqual']
1799         _video_extensions = {
1800                 'video': 'mp4',
1801                 'highqual': 'mp4',
1802                 'lowqual': 'mp4',
1803         }
1804         IE_NAME = u'facebook'
1805
1806         def __init__(self, downloader=None):
1807                 InfoExtractor.__init__(self, downloader)
1808
1809         def _reporter(self, message):
1810                 """Add header and report message."""
1811                 self._downloader.to_screen(u'[facebook] %s' % message)
1812
1813         def report_login(self):
1814                 """Report attempt to log in."""
1815                 self._reporter(u'Logging in')
1816
1817         def report_video_webpage_download(self, video_id):
1818                 """Report attempt to download video webpage."""
1819                 self._reporter(u'%s: Downloading video webpage' % video_id)
1820
1821         def report_information_extraction(self, video_id):
1822                 """Report attempt to extract video information."""
1823                 self._reporter(u'%s: Extracting video information' % video_id)
1824
1825         def _parse_page(self, video_webpage):
1826                 """Extract video information from page"""
1827                 # General data
1828                 data = {'title': r'\("video_title", "(.*?)"\)',
1829                         'description': r'<div class="datawrap">(.*?)</div>',
1830                         'owner': r'\("video_owner_name", "(.*?)"\)',
1831                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1832                         }
1833                 video_info = {}
1834                 for piece in data.keys():
1835                         mobj = re.search(data[piece], video_webpage)
1836                         if mobj is not None:
1837                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1838
1839                 # Video urls
1840                 video_urls = {}
1841                 for fmt in self._available_formats:
1842                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1843                         if mobj is not None:
1844                                 # URL is in a Javascript segment inside an escaped Unicode format within
1845                                 # the generally utf-8 page
1846                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1847                 video_info['video_urls'] = video_urls
1848
1849                 return video_info
1850
1851         def _real_initialize(self):
1852                 if self._downloader is None:
1853                         return
1854
1855                 useremail = None
1856                 password = None
1857                 downloader_params = self._downloader.params
1858
1859                 # Attempt to use provided username and password or .netrc data
1860                 if downloader_params.get('username', None) is not None:
1861                         useremail = downloader_params['username']
1862                         password = downloader_params['password']
1863                 elif downloader_params.get('usenetrc', False):
1864                         try:
1865                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1866                                 if info is not None:
1867                                         useremail = info[0]
1868                                         password = info[2]
1869                                 else:
1870                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1871                         except (IOError, netrc.NetrcParseError), err:
1872                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1873                                 return
1874
1875                 if useremail is None:
1876                         return
1877
1878                 # Log in
1879                 login_form = {
1880                         'email': useremail,
1881                         'pass': password,
1882                         'login': 'Log+In'
1883                         }
1884                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1885                 try:
1886                         self.report_login()
1887                         login_results = urllib2.urlopen(request).read()
1888                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1889                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1890                                 return
1891                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1892                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1893                         return
1894
1895         def _real_extract(self, url):
1896                 mobj = re.match(self._VALID_URL, url)
1897                 if mobj is None:
1898                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1899                         return
1900                 video_id = mobj.group('ID')
1901
1902                 # Get video webpage
1903                 self.report_video_webpage_download(video_id)
1904                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1905                 try:
1906                         page = urllib2.urlopen(request)
1907                         video_webpage = page.read()
1908                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1909                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1910                         return
1911
1912                 # Start extracting information
1913                 self.report_information_extraction(video_id)
1914
1915                 # Extract information
1916                 video_info = self._parse_page(video_webpage)
1917
1918                 # uploader
1919                 if 'owner' not in video_info:
1920                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1921                         return
1922                 video_uploader = video_info['owner']
1923
1924                 # title
1925                 if 'title' not in video_info:
1926                         self._downloader.trouble(u'ERROR: unable to extract video title')
1927                         return
1928                 video_title = video_info['title']
1929                 video_title = video_title.decode('utf-8')
1930
1931                 # thumbnail image
1932                 if 'thumbnail' not in video_info:
1933                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1934                         video_thumbnail = ''
1935                 else:
1936                         video_thumbnail = video_info['thumbnail']
1937
1938                 # upload date
1939                 upload_date = u'NA'
1940                 if 'upload_date' in video_info:
1941                         upload_time = video_info['upload_date']
1942                         timetuple = email.utils.parsedate_tz(upload_time)
1943                         if timetuple is not None:
1944                                 try:
1945                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1946                                 except:
1947                                         pass
1948
1949                 # description
1950                 video_description = video_info.get('description', 'No description available.')
1951
1952                 url_map = video_info['video_urls']
1953                 if len(url_map.keys()) > 0:
1954                         # Decide which formats to download
1955                         req_format = self._downloader.params.get('format', None)
1956                         format_limit = self._downloader.params.get('format_limit', None)
1957
1958                         if format_limit is not None and format_limit in self._available_formats:
1959                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1960                         else:
1961                                 format_list = self._available_formats
1962                         existing_formats = [x for x in format_list if x in url_map]
1963                         if len(existing_formats) == 0:
1964                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1965                                 return
1966                         if req_format is None:
1967                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1968                         elif req_format == 'worst':
1969                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1970                         elif req_format == '-1':
1971                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1972                         else:
1973                                 # Specific format
1974                                 if req_format not in url_map:
1975                                         self._downloader.trouble(u'ERROR: requested format not available')
1976                                         return
1977                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1978
1979                 results = []
1980                 for format_param, video_real_url in video_url_list:
1981                         # Extension
1982                         video_extension = self._video_extensions.get(format_param, 'mp4')
1983
1984                         results.append({
1985                                 'provider': IE_NAME,
1986                                 'id':           video_id.decode('utf-8'),
1987                                 'url':          video_real_url.decode('utf-8'),
1988                                 'uploader':     video_uploader.decode('utf-8'),
1989                                 'upload_date':  upload_date,
1990                                 'title':        video_title,
1991                                 'ext':          video_extension.decode('utf-8'),
1992                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1993                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1994                                 'description':  video_description.decode('utf-8'),
1995                                 'player_url':   None,
1996                         })
1997                 return results
1998
1999 class BlipTVIE(InfoExtractor):
2000         """Information extractor for blip.tv"""
2001
2002         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2003         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2004         IE_NAME = u'blip.tv'
2005
2006         def report_extraction(self, file_id):
2007                 """Report information extraction."""
2008                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2009
2010         def report_direct_download(self, title):
2011                 """Report information extraction."""
2012                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2013
2014         def _real_extract(self, url):
2015                 mobj = re.match(self._VALID_URL, url)
2016                 if mobj is None:
2017                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2018                         return
2019
2020                 if '?' in url:
2021                         cchar = '&'
2022                 else:
2023                         cchar = '?'
2024                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2025                 request = urllib2.Request(json_url.encode('utf-8'))
2026                 self.report_extraction(mobj.group(1))
2027                 info = None
2028                 try:
2029                         urlh = urllib2.urlopen(request)
2030                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2031                                 basename = url.split('/')[-1]
2032                                 title,ext = os.path.splitext(basename)
2033                                 title = title.decode('UTF-8')
2034                                 ext = ext.replace('.', '')
2035                                 self.report_direct_download(title)
2036                                 info = {
2037                                         'provider': IE_NAME,
2038                                         'id': title,
2039                                         'url': url,
2040                                         'title': title,
2041                                         'ext': ext,
2042                                         'urlhandle': urlh
2043                                 }
2044                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2045                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2046                         return
2047                 if info is None: # Regular URL
2048                         try:
2049                                 json_code = urlh.read()
2050                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2051                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2052                                 return
2053
2054                         try:
2055                                 json_data = json.loads(json_code)
2056                                 if 'Post' in json_data:
2057                                         data = json_data['Post']
2058                                 else:
2059                                         data = json_data
2060
2061                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2062                                 video_url = data['media']['url']
2063                                 umobj = re.match(self._URL_EXT, video_url)
2064                                 if umobj is None:
2065                                         raise ValueError('Can not determine filename extension')
2066                                 ext = umobj.group(1)
2067
2068                                 info = {
2069                                         'provider': IE_NAME,
2070                                         'id': data['item_id'],
2071                                         'url': video_url,
2072                                         'uploader': data['display_name'],
2073                                         'upload_date': upload_date,
2074                                         'title': data['title'],
2075                                         'ext': ext,
2076                                         'format': data['media']['mimeType'],
2077                                         'thumbnail': data['thumbnailUrl'],
2078                                         'description': data['description'],
2079                                         'player_url': data['embedUrl']
2080                                 }
2081                         except (ValueError,KeyError), err:
2082                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2083                                 return
2084
2085                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2086                 return [info]
2087
2088
2089 class MyVideoIE(InfoExtractor):
2090         """Information Extractor for myvideo.de."""
2091
2092         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2093         IE_NAME = u'myvideo'
2094
2095         def __init__(self, downloader=None):
2096                 InfoExtractor.__init__(self, downloader)
2097         
2098         def report_download_webpage(self, video_id):
2099                 """Report webpage download."""
2100                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2101
2102         def report_extraction(self, video_id):
2103                 """Report information extraction."""
2104                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2105
2106         def _real_extract(self,url):
2107                 mobj = re.match(self._VALID_URL, url)
2108                 if mobj is None:
2109                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2110                         return
2111
2112                 video_id = mobj.group(1)
2113
2114                 # Get video webpage
2115                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2116                 try:
2117                         self.report_download_webpage(video_id)
2118                         webpage = urllib2.urlopen(request).read()
2119                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2120                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2121                         return
2122
2123                 self.report_extraction(video_id)
2124                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2125                                  webpage)
2126                 if mobj is None:
2127                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2128                         return
2129                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2130
2131                 mobj = re.search('<title>([^<]+)</title>', webpage)
2132                 if mobj is None:
2133                         self._downloader.trouble(u'ERROR: unable to extract title')
2134                         return
2135
2136                 video_title = mobj.group(1)
2137
2138                 return [{
2139                         'provider': IE_NAME,
2140                         'id':           video_id,
2141                         'url':          video_url,
2142                         'uploader':     u'NA',
2143                         'upload_date':  u'NA',
2144                         'title':        video_title,
2145                         'ext':          u'flv',
2146                         'format':       u'NA',
2147                         'player_url':   None,
2148                 }]
2149
2150 class ComedyCentralIE(InfoExtractor):
2151         """Information extractor for The Daily Show and Colbert Report """
2152
2153         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2154         IE_NAME = u'comedycentral'
2155
2156         def report_extraction(self, episode_id):
2157                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2158
2159         def report_config_download(self, episode_id):
2160                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2161
2162         def report_index_download(self, episode_id):
2163                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2164
2165         def report_player_url(self, episode_id):
2166                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2167
2168         def _real_extract(self, url):
2169                 mobj = re.match(self._VALID_URL, url)
2170                 if mobj is None:
2171                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2172                         return
2173
2174                 if mobj.group('shortname'):
2175                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2176                                 url = u'http://www.thedailyshow.com/full-episodes/'
2177                         else:
2178                                 url = u'http://www.colbertnation.com/full-episodes/'
2179                         mobj = re.match(self._VALID_URL, url)
2180                         assert mobj is not None
2181
2182                 dlNewest = not mobj.group('episode')
2183                 if dlNewest:
2184                         epTitle = mobj.group('showname')
2185                 else:
2186                         epTitle = mobj.group('episode')
2187
2188                 req = urllib2.Request(url)
2189                 self.report_extraction(epTitle)
2190                 try:
2191                         htmlHandle = urllib2.urlopen(req)
2192                         html = htmlHandle.read()
2193                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2194                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2195                         return
2196                 if dlNewest:
2197                         url = htmlHandle.geturl()
2198                         mobj = re.match(self._VALID_URL, url)
2199                         if mobj is None:
2200                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2201                                 return
2202                         if mobj.group('episode') == '':
2203                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2204                                 return
2205                         epTitle = mobj.group('episode')
2206
2207                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2208                 if len(mMovieParams) == 0:
2209                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2210                         return
2211
2212                 playerUrl_raw = mMovieParams[0][0]
2213                 self.report_player_url(epTitle)
2214                 try:
2215                         urlHandle = urllib2.urlopen(playerUrl_raw)
2216                         playerUrl = urlHandle.geturl()
2217                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2218                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2219                         return
2220
2221                 uri = mMovieParams[0][1]
2222                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2223                 self.report_index_download(epTitle)
2224                 try:
2225                         indexXml = urllib2.urlopen(indexUrl).read()
2226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2227                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2228                         return
2229
2230                 results = []
2231
2232                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2233                 itemEls = idoc.findall('.//item')
2234                 for itemEl in itemEls:
2235                         mediaId = itemEl.findall('./guid')[0].text
2236                         shortMediaId = mediaId.split(':')[-1]
2237                         showId = mediaId.split(':')[-2].replace('.com', '')
2238                         officialTitle = itemEl.findall('./title')[0].text
2239                         officialDate = itemEl.findall('./pubDate')[0].text
2240
2241                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2242                                                 urllib.urlencode({'uri': mediaId}))
2243                         configReq = urllib2.Request(configUrl)
2244                         self.report_config_download(epTitle)
2245                         try:
2246                                 configXml = urllib2.urlopen(configReq).read()
2247                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2248                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2249                                 return
2250
2251                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2252                         turls = []
2253                         for rendition in cdoc.findall('.//rendition'):
2254                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2255                                 turls.append(finfo)
2256
2257                         if len(turls) == 0:
2258                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2259                                 continue
2260
2261                         # For now, just pick the highest bitrate
2262                         format,video_url = turls[-1]
2263
2264                         effTitle = showId + u'-' + epTitle
2265                         info = {
2266                                 'provider': IE_NAME,
2267                                 'id': shortMediaId,
2268                                 'url': video_url,
2269                                 'uploader': showId,
2270                                 'upload_date': officialDate,
2271                                 'title': effTitle,
2272                                 'ext': 'mp4',
2273                                 'format': format,
2274                                 'thumbnail': None,
2275                                 'description': officialTitle,
2276                                 'player_url': playerUrl
2277                         }
2278
2279                         results.append(info)
2280                         
2281                 return results
2282
2283
2284 class EscapistIE(InfoExtractor):
2285         """Information extractor for The Escapist """
2286
2287         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2288         IE_NAME = u'escapist'
2289
2290         def report_extraction(self, showName):
2291                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2292
2293         def report_config_download(self, showName):
2294                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2295
2296         def _real_extract(self, url):
2297                 mobj = re.match(self._VALID_URL, url)
2298                 if mobj is None:
2299                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2300                         return
2301                 showName = mobj.group('showname')
2302                 videoId = mobj.group('episode')
2303
2304                 self.report_extraction(showName)
2305                 try:
2306                         webPage = urllib2.urlopen(url)
2307                         webPageBytes = webPage.read()
2308                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2309                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2310                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2312                         return
2313
2314                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2315                 description = unescapeHTML(descMatch.group(1))
2316                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2317                 imgUrl = unescapeHTML(imgMatch.group(1))
2318                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2319                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2320                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2321                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2322
2323                 self.report_config_download(showName)
2324                 try:
2325                         configJSON = urllib2.urlopen(configUrl).read()
2326                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2327                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2328                         return
2329
2330                 # Technically, it's JavaScript, not JSON
2331                 configJSON = configJSON.replace("'", '"')
2332
2333                 try:
2334                         config = json.loads(configJSON)
2335                 except (ValueError,), err:
2336                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2337                         return
2338
2339                 playlist = config['playlist']
2340                 videoUrl = playlist[1]['url']
2341
2342                 info = {
2343                         'provider': IE_NAME,
2344                         'id': videoId,
2345                         'url': videoUrl,
2346                         'uploader': showName,
2347                         'upload_date': None,
2348                         'title': showName,
2349                         'ext': 'flv',
2350                         'format': 'flv',
2351                         'thumbnail': imgUrl,
2352                         'description': description,
2353                         'player_url': playerUrl,
2354                 }
2355
2356                 return [info]
2357
2358
2359 class CollegeHumorIE(InfoExtractor):
2360         """Information extractor for collegehumor.com"""
2361
2362         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2363         IE_NAME = u'collegehumor'
2364
2365         def report_webpage(self, video_id):
2366                 """Report information extraction."""
2367                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2368
2369         def report_extraction(self, video_id):
2370                 """Report information extraction."""
2371                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2372
2373         def _real_extract(self, url):
2374                 mobj = re.match(self._VALID_URL, url)
2375                 if mobj is None:
2376                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2377                         return
2378                 video_id = mobj.group('videoid')
2379
2380                 self.report_webpage(video_id)
2381                 request = urllib2.Request(url)
2382                 try:
2383                         webpage = urllib2.urlopen(request).read()
2384                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2385                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2386                         return
2387
2388                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2389                 if m is None:
2390                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2391                         return
2392                 internal_video_id = m.group('internalvideoid')
2393
2394                 info = {
2395                         'provider': IE_NAME,
2396                         'id': video_id,
2397                         'internal_id': internal_video_id,
2398                 }
2399
2400                 self.report_extraction(video_id)
2401                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2402                 try:
2403                         metaXml = urllib2.urlopen(xmlUrl).read()
2404                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2406                         return
2407
2408                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2409                 try:
2410                         videoNode = mdoc.findall('./video')[0]
2411                         info['description'] = videoNode.findall('./description')[0].text
2412                         info['title'] = videoNode.findall('./caption')[0].text
2413                         info['url'] = videoNode.findall('./file')[0].text
2414                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2415                         info['ext'] = info['url'].rpartition('.')[2]
2416                         info['format'] = info['ext']
2417                 except IndexError:
2418                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2419                         return
2420
2421                 return [info]
2422
2423
2424 class XVideosIE(InfoExtractor):
2425         """Information extractor for xvideos.com"""
2426
2427         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2428         IE_NAME = u'xvideos'
2429
2430         def report_webpage(self, video_id):
2431                 """Report information extraction."""
2432                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2433
2434         def report_extraction(self, video_id):
2435                 """Report information extraction."""
2436                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2437
2438         def _real_extract(self, url):
2439                 mobj = re.match(self._VALID_URL, url)
2440                 if mobj is None:
2441                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2442                         return
2443                 video_id = mobj.group(1).decode('utf-8')
2444
2445                 self.report_webpage(video_id)
2446
2447                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2448                 try:
2449                         webpage = urllib2.urlopen(request).read()
2450                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2451                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2452                         return
2453
2454                 self.report_extraction(video_id)
2455
2456
2457                 # Extract video URL
2458                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2459                 if mobj is None:
2460                         self._downloader.trouble(u'ERROR: unable to extract video url')
2461                         return
2462                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2463
2464
2465                 # Extract title
2466                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2467                 if mobj is None:
2468                         self._downloader.trouble(u'ERROR: unable to extract video title')
2469                         return
2470                 video_title = mobj.group(1).decode('utf-8')
2471
2472
2473                 # Extract video thumbnail
2474                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2475                 if mobj is None:
2476                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2477                         return
2478                 video_thumbnail = mobj.group(0).decode('utf-8')
2479
2480                 info = {
2481                         'provider': IE_NAME,
2482                         'id': video_id,
2483                         'url': video_url,
2484                         'uploader': None,
2485                         'upload_date': None,
2486                         'title': video_title,
2487                         'ext': 'flv',
2488                         'format': 'flv',
2489                         'thumbnail': video_thumbnail,
2490                         'description': None,
2491                         'player_url': None,
2492                 }
2493
2494                 return [info]
2495
2496
2497 class SoundcloudIE(InfoExtractor):
2498         """Information extractor for soundcloud.com
2499            To access the media, the uid of the song and a stream token
2500            must be extracted from the page source and the script must make
2501            a request to media.soundcloud.com/crossdomain.xml. Then
2502            the media can be grabbed by requesting from an url composed
2503            of the stream token and uid
2504          """
2505
2506         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2507         IE_NAME = u'soundcloud'
2508
2509         def __init__(self, downloader=None):
2510                 InfoExtractor.__init__(self, downloader)
2511
2512         def report_webpage(self, video_id):
2513                 """Report information extraction."""
2514                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2515
2516         def report_extraction(self, video_id):
2517                 """Report information extraction."""
2518                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2519
2520         def _real_extract(self, url):
2521                 mobj = re.match(self._VALID_URL, url)
2522                 if mobj is None:
2523                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2524                         return
2525
2526                 # extract uploader (which is in the url)
2527                 uploader = mobj.group(1).decode('utf-8')
2528                 # extract simple title (uploader + slug of song title)
2529                 slug_title =  mobj.group(2).decode('utf-8')
2530                 simple_title = uploader + u'-' + slug_title
2531
2532                 self.report_webpage('%s/%s' % (uploader, slug_title))
2533
2534                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2535                 try:
2536                         webpage = urllib2.urlopen(request).read()
2537                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2538                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2539                         return
2540
2541                 self.report_extraction('%s/%s' % (uploader, slug_title))
2542
2543                 # extract uid and stream token that soundcloud hands out for access
2544                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2545                 if mobj:
2546                         video_id = mobj.group(1)
2547                         stream_token = mobj.group(2)
2548
2549                 # extract unsimplified title
2550                 mobj = re.search('"title":"(.*?)",', webpage)
2551                 if mobj:
2552                         title = mobj.group(1).decode('utf-8')
2553                 else:
2554                         title = simple_title
2555
2556                 # construct media url (with uid/token)
2557                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2558                 mediaURL = mediaURL % (video_id, stream_token)
2559
2560                 # description
2561                 description = u'No description available'
2562                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2563                 if mobj:
2564                         description = mobj.group(1)
2565                 
2566                 # upload date
2567                 upload_date = None
2568                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2569                 if mobj:
2570                         try:
2571                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2572                         except Exception, e:
2573                                 self._downloader.to_stderr(str(e))
2574
2575                 # for soundcloud, a request to a cross domain is required for cookies
2576                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2577
2578                 return [{
2579                         'provider': IE_NAME,
2580                         'id':           video_id.decode('utf-8'),
2581                         'url':          mediaURL,
2582                         'uploader':     uploader.decode('utf-8'),
2583                         'upload_date':  upload_date,
2584                         'title':        title,
2585                         'ext':          u'mp3',
2586                         'format':       u'NA',
2587                         'player_url':   None,
2588                         'description': description.decode('utf-8')
2589                 }]
2590
2591
2592 class InfoQIE(InfoExtractor):
2593         """Information extractor for infoq.com"""
2594
2595         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2596         IE_NAME = u'infoq'
2597
2598         def report_webpage(self, video_id):
2599                 """Report information extraction."""
2600                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2601
2602         def report_extraction(self, video_id):
2603                 """Report information extraction."""
2604                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2605
2606         def _real_extract(self, url):
2607                 mobj = re.match(self._VALID_URL, url)
2608                 if mobj is None:
2609                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2610                         return
2611
2612                 self.report_webpage(url)
2613
2614                 request = urllib2.Request(url)
2615                 try:
2616                         webpage = urllib2.urlopen(request).read()
2617                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2619                         return
2620
2621                 self.report_extraction(url)
2622
2623
2624                 # Extract video URL
2625                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2626                 if mobj is None:
2627                         self._downloader.trouble(u'ERROR: unable to extract video url')
2628                         return
2629                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2630
2631
2632                 # Extract title
2633                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2634                 if mobj is None:
2635                         self._downloader.trouble(u'ERROR: unable to extract video title')
2636                         return
2637                 video_title = mobj.group(1).decode('utf-8')
2638
2639                 # Extract description
2640                 video_description = u'No description available.'
2641                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2642                 if mobj is not None:
2643                         video_description = mobj.group(1).decode('utf-8')
2644
2645                 video_filename = video_url.split('/')[-1]
2646                 video_id, extension = video_filename.split('.')
2647
2648                 info = {
2649                         'provider': IE_NAME,
2650                         'id': video_id,
2651                         'url': video_url,
2652                         'uploader': None,
2653                         'upload_date': None,
2654                         'title': video_title,
2655                         'ext': extension,
2656                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2657                         'thumbnail': None,
2658                         'description': video_description,
2659                         'player_url': None,
2660                 }
2661
2662                 return [info]
2663
2664 class MixcloudIE(InfoExtractor):
2665         """Information extractor for www.mixcloud.com"""
2666         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2667         IE_NAME = u'mixcloud'
2668
2669         def __init__(self, downloader=None):
2670                 InfoExtractor.__init__(self, downloader)
2671
2672         def report_download_json(self, file_id):
2673                 """Report JSON download."""
2674                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2675
2676         def report_extraction(self, file_id):
2677                 """Report information extraction."""
2678                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2679
2680         def get_urls(self, jsonData, fmt, bitrate='best'):
2681                 """Get urls from 'audio_formats' section in json"""
2682                 file_url = None
2683                 try:
2684                         bitrate_list = jsonData[fmt]
2685                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2686                                 bitrate = max(bitrate_list) # select highest
2687
2688                         url_list = jsonData[fmt][bitrate]
2689                 except TypeError: # we have no bitrate info.
2690                         url_list = jsonData[fmt]
2691                 return url_list
2692
2693         def check_urls(self, url_list):
2694                 """Returns 1st active url from list"""
2695                 for url in url_list:
2696                         try:
2697                                 urllib2.urlopen(url)
2698                                 return url
2699                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2700                                 url = None
2701
2702                 return None
2703
2704         def _print_formats(self, formats):
2705                 print 'Available formats:'
2706                 for fmt in formats.keys():
2707                         for b in formats[fmt]:
2708                                 try:
2709                                         ext = formats[fmt][b][0]
2710                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2711                                 except TypeError: # we have no bitrate info
2712                                         ext = formats[fmt][0]
2713                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2714                                         break
2715
2716         def _real_extract(self, url):
2717                 mobj = re.match(self._VALID_URL, url)
2718                 if mobj is None:
2719                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2720                         return
2721                 # extract uploader & filename from url
2722                 uploader = mobj.group(1).decode('utf-8')
2723                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2724
2725                 # construct API request
2726                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2727                 # retrieve .json file with links to files
2728                 request = urllib2.Request(file_url)
2729                 try:
2730                         self.report_download_json(file_url)
2731                         jsonData = urllib2.urlopen(request).read()
2732                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2733                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2734                         return
2735
2736                 # parse JSON
2737                 json_data = json.loads(jsonData)
2738                 player_url = json_data['player_swf_url']
2739                 formats = dict(json_data['audio_formats'])
2740
2741                 req_format = self._downloader.params.get('format', None)
2742                 bitrate = None
2743
2744                 if self._downloader.params.get('listformats', None):
2745                         self._print_formats(formats)
2746                         return
2747
2748                 if req_format is None or req_format == 'best':
2749                         for format_param in formats.keys():
2750                                 url_list = self.get_urls(formats, format_param)
2751                                 # check urls
2752                                 file_url = self.check_urls(url_list)
2753                                 if file_url is not None:
2754                                         break # got it!
2755                 else:
2756                         if req_format not in formats.keys():
2757                                 self._downloader.trouble(u'ERROR: format is not available')
2758                                 return
2759
2760                         url_list = self.get_urls(formats, req_format)
2761                         file_url = self.check_urls(url_list)
2762                         format_param = req_format
2763
2764                 return [{
2765                         'provider': IE_NAME,
2766                         'id': file_id.decode('utf-8'),
2767                         'url': file_url.decode('utf-8'),
2768                         'uploader':     uploader.decode('utf-8'),
2769                         'upload_date': u'NA',
2770                         'title': json_data['name'],
2771                         'ext': file_url.split('.')[-1].decode('utf-8'),
2772                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2773                         'thumbnail': json_data['thumbnail_url'],
2774                         'description': json_data['description'],
2775                         'player_url': player_url.decode('utf-8'),
2776                 }]
2777
2778 class StanfordOpenClassroomIE(InfoExtractor):
2779         """Information extractor for Stanford's Open ClassRoom"""
2780
2781         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2782         IE_NAME = u'stanfordoc'
2783
2784         def report_download_webpage(self, objid):
2785                 """Report information extraction."""
2786                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2787
2788         def report_extraction(self, video_id):
2789                 """Report information extraction."""
2790                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2791
2792         def _real_extract(self, url):
2793                 mobj = re.match(self._VALID_URL, url)
2794                 if mobj is None:
2795                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2796                         return
2797
2798                 if mobj.group('course') and mobj.group('video'): # A specific video
2799                         course = mobj.group('course')
2800                         video = mobj.group('video')
2801                         info = {
2802                                 'provider': IE_NAME,
2803                                 'id': course + '_' + video,
2804                         }
2805
2806                         self.report_extraction(info['id'])
2807                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2808                         xmlUrl = baseUrl + video + '.xml'
2809                         try:
2810                                 metaXml = urllib2.urlopen(xmlUrl).read()
2811                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2812                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2813                                 return
2814                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2815                         try:
2816                                 info['title'] = mdoc.findall('./title')[0].text
2817                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2818                         except IndexError:
2819                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2820                                 return
2821                         info['ext'] = info['url'].rpartition('.')[2]
2822                         info['format'] = info['ext']
2823                         return [info]
2824                 elif mobj.group('course'): # A course page
2825                         course = mobj.group('course')
2826                         info = {
2827                                 'provider': IE_NAME,
2828                                 'id': course,
2829                                 'type': 'playlist',
2830                         }
2831
2832                         self.report_download_webpage(info['id'])
2833                         try:
2834                                 coursepage = urllib2.urlopen(url).read()
2835                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2836                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2837                                 return
2838
2839                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2840                         if m:
2841                                 info['title'] = unescapeHTML(m.group(1))
2842                         else:
2843                                 info['title'] = info['id']
2844
2845                         m = re.search('<description>([^<]+)</description>', coursepage)
2846                         if m:
2847                                 info['description'] = unescapeHTML(m.group(1))
2848
2849                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2850                         info['list'] = [
2851                                 {
2852                                         'type': 'reference',
2853                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2854                                 }
2855                                         for vpage in links]
2856                         results = []
2857                         for entry in info['list']:
2858                                 assert entry['type'] == 'reference'
2859                                 results += self.extract(entry['url'])
2860                         return results
2861                         
2862                 else: # Root page
2863                         info = {
2864                                 'provider': IE_NAME,
2865                                 'id': 'Stanford OpenClassroom',
2866                                 'type': 'playlist',
2867                         }
2868
2869                         self.report_download_webpage(info['id'])
2870                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2871                         try:
2872                                 rootpage = urllib2.urlopen(rootURL).read()
2873                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2874                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2875                                 return
2876
2877                         info['title'] = info['id']
2878
2879                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2880                         info['list'] = [
2881                                 {
2882                                         'type': 'reference',
2883                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2884                                 }
2885                                         for cpage in links]
2886
2887                         results = []
2888                         for entry in info['list']:
2889                                 assert entry['type'] == 'reference'
2890                                 results += self.extract(entry['url'])
2891                         return results
2892
2893 class MTVIE(InfoExtractor):
2894         """Information extractor for MTV.com"""
2895
2896         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2897         IE_NAME = u'mtv'
2898
2899         def report_webpage(self, video_id):
2900                 """Report information extraction."""
2901                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2902
2903         def report_extraction(self, video_id):
2904                 """Report information extraction."""
2905                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2906
2907         def _real_extract(self, url):
2908                 mobj = re.match(self._VALID_URL, url)
2909                 if mobj is None:
2910                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2911                         return
2912                 if not mobj.group('proto'):
2913                         url = 'http://' + url
2914                 video_id = mobj.group('videoid')
2915                 self.report_webpage(video_id)
2916
2917                 request = urllib2.Request(url)
2918                 try:
2919                         webpage = urllib2.urlopen(request).read()
2920                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2921                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2922                         return
2923
2924                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2925                 if mobj is None:
2926                         self._downloader.trouble(u'ERROR: unable to extract song name')
2927                         return
2928                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2929                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2930                 if mobj is None:
2931                         self._downloader.trouble(u'ERROR: unable to extract performer')
2932                         return
2933                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2934                 video_title = performer + ' - ' + song_name 
2935
2936                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2937                 if mobj is None:
2938                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2939                         return
2940                 mtvn_uri = mobj.group(1)
2941
2942                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2943                 if mobj is None:
2944                         self._downloader.trouble(u'ERROR: unable to extract content id')
2945                         return
2946                 content_id = mobj.group(1)
2947
2948                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2949                 self.report_extraction(video_id)
2950                 request = urllib2.Request(videogen_url)
2951                 try:
2952                         metadataXml = urllib2.urlopen(request).read()
2953                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2954                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2955                         return
2956
2957                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2958                 renditions = mdoc.findall('.//rendition')
2959
2960                 # For now, always pick the highest quality.
2961                 rendition = renditions[-1]
2962
2963                 try:
2964                         _,_,ext = rendition.attrib['type'].partition('/')
2965                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2966                         video_url = rendition.find('./src').text
2967                 except KeyError:
2968                         self._downloader.trouble('Invalid rendition field.')
2969                         return
2970
2971                 info = {
2972                         'provider': IE_NAME,
2973                         'id': video_id,
2974                         'url': video_url,
2975                         'uploader': performer,
2976                         'title': video_title,
2977                         'ext': ext,
2978                         'format': format,
2979                 }
2980
2981                 return [info]