Merge remote-tracking branch 'FiloSottille/vbr'
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 from urlparse import parse_qs
19
20 try:
21         import cStringIO as StringIO
22 except ImportError:
23         import StringIO
24
25 from utils import *
26
27
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and pulls out everything the
	downloader needs to fetch the video(s) it points to: the real media
	URL, the title, the uploader and so on. The result is a dictionary
	handed to the FileDownloader, which may then download the video to
	the file system, among other possible outcomes. Each dictionary must
	carry the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The fields below are optional. They mainly exist so youtube-dl can
	serve as the backend of a video search function, such as the one in
	youtube2mp3, and are only consulted when their respective forced
	printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract()
	and define a _VALID_URL regexp. They should usually also be added
	to the list of extractors.
	"""

	# Class-level defaults; __init__ re-assigns both per instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this IE can handle the given URL."""
		return bool(re.match(self._VALID_URL, url))

	def initialize(self):
		"""Perform one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it as a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader this IE reports to."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
95
96
97 class YoutubeIE(InfoExtractor):
98         """Information extractor for youtube.com."""
99
100         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube\.majestyc\.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
101         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
102         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
103         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
104         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
105         _NETRC_MACHINE = 'youtube'
106         # Listed in order of quality
107         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
108         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
109         _video_extensions = {
110                 '13': '3gp',
111                 '17': 'mp4',
112                 '18': 'mp4',
113                 '22': 'mp4',
114                 '37': 'mp4',
115                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
116                 '43': 'webm',
117                 '44': 'webm',
118                 '45': 'webm',
119                 '46': 'webm',
120         }
121         _video_dimensions = {
122                 '5': '240x400',
123                 '6': '???',
124                 '13': '???',
125                 '17': '144x176',
126                 '18': '360x640',
127                 '22': '720x1280',
128                 '34': '360x640',
129                 '35': '480x854',
130                 '37': '1080x1920',
131                 '38': '3072x4096',
132                 '43': '360x640',
133                 '44': '480x854',
134                 '45': '720x1280',
135                 '46': '1080x1920',
136         }       
137         IE_NAME = u'youtube'
138
139         def report_lang(self):
140                 """Report attempt to set language."""
141                 self._downloader.to_screen(u'[youtube] Setting language')
142
143         def report_login(self):
144                 """Report attempt to log in."""
145                 self._downloader.to_screen(u'[youtube] Logging in')
146
147         def report_age_confirmation(self):
148                 """Report attempt to confirm age."""
149                 self._downloader.to_screen(u'[youtube] Confirming age')
150
151         def report_video_webpage_download(self, video_id):
152                 """Report attempt to download video webpage."""
153                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
154
155         def report_video_info_webpage_download(self, video_id):
156                 """Report attempt to download video info webpage."""
157                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
158
159         def report_video_subtitles_download(self, video_id):
160                 """Report attempt to download video info webpage."""
161                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
162
163         def report_information_extraction(self, video_id):
164                 """Report attempt to extract video information."""
165                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
166
167         def report_unavailable_format(self, video_id, format):
168                 """Report extracted video URL."""
169                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
170
171         def report_rtmp_download(self):
172                 """Indicate the download will use the RTMP protocol."""
173                 self._downloader.to_screen(u'[youtube] RTMP download detected')
174
175         def _closed_captions_xml_to_srt(self, xml_string):
176                 srt = ''
177                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
178                 # TODO parse xml instead of regex
179                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
180                         if not dur: dur = '4'
181                         start = float(start)
182                         end = start + float(dur)
183                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
184                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
185                         caption = unescapeHTML(caption)
186                         caption = unescapeHTML(caption) # double cycle, intentional
187                         srt += str(n+1) + '\n'
188                         srt += start + ' --> ' + end + '\n'
189                         srt += caption + '\n\n'
190                 return srt
191
192         def _print_formats(self, formats):
193                 print 'Available formats:'
194                 for x in formats:
195                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
196
197         def _real_initialize(self):
198                 if self._downloader is None:
199                         return
200
201                 username = None
202                 password = None
203                 downloader_params = self._downloader.params
204
205                 # Attempt to use provided username and password or .netrc data
206                 if downloader_params.get('username', None) is not None:
207                         username = downloader_params['username']
208                         password = downloader_params['password']
209                 elif downloader_params.get('usenetrc', False):
210                         try:
211                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
212                                 if info is not None:
213                                         username = info[0]
214                                         password = info[2]
215                                 else:
216                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
217                         except (IOError, netrc.NetrcParseError), err:
218                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
219                                 return
220
221                 # Set language
222                 request = urllib2.Request(self._LANG_URL)
223                 try:
224                         self.report_lang()
225                         urllib2.urlopen(request).read()
226                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
227                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
228                         return
229
230                 # No authentication to be performed
231                 if username is None:
232                         return
233
234                 # Log in
235                 login_form = {
236                                 'current_form': 'loginForm',
237                                 'next':         '/',
238                                 'action_login': 'Log In',
239                                 'username':     username,
240                                 'password':     password,
241                                 }
242                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
243                 try:
244                         self.report_login()
245                         login_results = urllib2.urlopen(request).read()
246                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
247                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
248                                 return
249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
250                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
251                         return
252
253                 # Confirm age
254                 age_form = {
255                                 'next_url':             '/',
256                                 'action_confirm':       'Confirm',
257                                 }
258                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
259                 try:
260                         self.report_age_confirmation()
261                         age_results = urllib2.urlopen(request).read()
262                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
263                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
264                         return
265
266         def _real_extract(self, url):
267                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
268                 mobj = re.search(self._NEXT_URL_RE, url)
269                 if mobj:
270                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
271
272                 # Extract video id from URL
273                 mobj = re.match(self._VALID_URL, url)
274                 if mobj is None:
275                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
276                         return
277                 video_id = mobj.group(2)
278
279                 # Get video webpage
280                 self.report_video_webpage_download(video_id)
281                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
282                 try:
283                         video_webpage = urllib2.urlopen(request).read()
284                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
285                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
286                         return
287
288                 # Attempt to extract SWF player URL
289                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
290                 if mobj is not None:
291                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
292                 else:
293                         player_url = None
294
295                 # Get video info
296                 self.report_video_info_webpage_download(video_id)
297                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
298                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
299                                         % (video_id, el_type))
300                         request = urllib2.Request(video_info_url)
301                         try:
302                                 video_info_webpage = urllib2.urlopen(request).read()
303                                 video_info = parse_qs(video_info_webpage)
304                                 if 'token' in video_info:
305                                         break
306                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
307                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
308                                 return
309                 if 'token' not in video_info:
310                         if 'reason' in video_info:
311                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
312                         else:
313                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
314                         return
315
316                 # Check for "rental" videos
317                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
318                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
319                         return
320
321                 # Start extracting information
322                 self.report_information_extraction(video_id)
323
324                 # uploader
325                 if 'author' not in video_info:
326                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
327                         return
328                 video_uploader = urllib.unquote_plus(video_info['author'][0])
329
330                 # title
331                 if 'title' not in video_info:
332                         self._downloader.trouble(u'ERROR: unable to extract video title')
333                         return
334                 video_title = urllib.unquote_plus(video_info['title'][0])
335                 video_title = video_title.decode('utf-8')
336
337                 # thumbnail image
338                 if 'thumbnail_url' not in video_info:
339                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
340                         video_thumbnail = ''
341                 else:   # don't panic if we can't find it
342                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
343
344                 # upload date
345                 upload_date = u'NA'
346                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
347                 if mobj is not None:
348                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
349                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
350                         for expression in format_expressions:
351                                 try:
352                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
353                                 except:
354                                         pass
355
356                 # description
357                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
358                 if video_description: video_description = clean_html(video_description)
359                 else: video_description = ''
360                         
361                 # closed captions
362                 video_subtitles = None
363                 if self._downloader.params.get('writesubtitles', False):
364                         try:
365                                 self.report_video_subtitles_download(video_id)
366                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
367                                 try:
368                                         srt_list = urllib2.urlopen(request).read()
369                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
370                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
371                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
372                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
373                                 if not srt_lang_list:
374                                         raise Trouble(u'WARNING: video has no closed captions')
375                                 if self._downloader.params.get('subtitleslang', False):
376                                         srt_lang = self._downloader.params.get('subtitleslang')
377                                 elif 'en' in srt_lang_list:
378                                         srt_lang = 'en'
379                                 else:
380                                         srt_lang = srt_lang_list.keys()[0]
381                                 if not srt_lang in srt_lang_list:
382                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
383                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
384                                 try:
385                                         srt_xml = urllib2.urlopen(request).read()
386                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
387                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
388                                 if not srt_xml:
389                                         raise Trouble(u'WARNING: unable to download video subtitles')
390                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
391                         except Trouble as trouble:
392                                 self._downloader.trouble(trouble[0])
393
394                 # token
395                 video_token = urllib.unquote_plus(video_info['token'][0])
396
397                 # Decide which formats to download
398                 req_format = self._downloader.params.get('format', None)
399
400                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
401                         self.report_rtmp_download()
402                         video_url_list = [(None, video_info['conn'][0])]
403                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
404                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
405                         url_data = [parse_qs(uds) for uds in url_data_strs]
406                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
407                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
408
409                         format_limit = self._downloader.params.get('format_limit', None)
410                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
411                         if format_limit is not None and format_limit in available_formats:
412                                 format_list = available_formats[available_formats.index(format_limit):]
413                         else:
414                                 format_list = available_formats
415                         existing_formats = [x for x in format_list if x in url_map]
416                         if len(existing_formats) == 0:
417                                 self._downloader.trouble(u'ERROR: no known formats available for video')
418                                 return
419                         if self._downloader.params.get('listformats', None):
420                                 self._print_formats(existing_formats)
421                                 return
422                         if req_format is None or req_format == 'best':
423                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
424                         elif req_format == 'worst':
425                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
426                         elif req_format in ('-1', 'all'):
427                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
428                         else:
429                                 # Specific formats. We pick the first in a slash-delimeted sequence.
430                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
431                                 req_formats = req_format.split('/')
432                                 video_url_list = None
433                                 for rf in req_formats:
434                                         if rf in url_map:
435                                                 video_url_list = [(rf, url_map[rf])]
436                                                 break
437                                 if video_url_list is None:
438                                         self._downloader.trouble(u'ERROR: requested format not available')
439                                         return
440                 else:
441                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
442                         return
443
444                 results = []
445                 for format_param, video_real_url in video_url_list:
446                         # Extension
447                         video_extension = self._video_extensions.get(format_param, 'flv')
448
449                         results.append({
450                                 'id':           video_id.decode('utf-8'),
451                                 'url':          video_real_url.decode('utf-8'),
452                                 'uploader':     video_uploader.decode('utf-8'),
453                                 'upload_date':  upload_date,
454                                 'title':        video_title,
455                                 'ext':          video_extension.decode('utf-8'),
456                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
457                                 'thumbnail':    video_thumbnail.decode('utf-8'),
458                                 'description':  video_description,
459                                 'player_url':   player_url,
460                                 'subtitles':    video_subtitles
461                         })
462                 return results
463
464
465 class MetacafeIE(InfoExtractor):
466         """Information Extractor for metacafe.com."""
467
468         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
469         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
470         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
471         IE_NAME = u'metacafe'
472
473         def __init__(self, downloader=None):
474                 InfoExtractor.__init__(self, downloader)
475
476         def report_disclaimer(self):
477                 """Report disclaimer retrieval."""
478                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
479
480         def report_age_confirmation(self):
481                 """Report attempt to confirm age."""
482                 self._downloader.to_screen(u'[metacafe] Confirming age')
483
484         def report_download_webpage(self, video_id):
485                 """Report webpage download."""
486                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
487
488         def report_extraction(self, video_id):
489                 """Report information extraction."""
490                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
491
492         def _real_initialize(self):
493                 # Retrieve disclaimer
494                 request = urllib2.Request(self._DISCLAIMER)
495                 try:
496                         self.report_disclaimer()
497                         disclaimer = urllib2.urlopen(request).read()
498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
499                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
500                         return
501
502                 # Confirm age
503                 disclaimer_form = {
504                         'filters': '0',
505                         'submit': "Continue - I'm over 18",
506                         }
507                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
508                 try:
509                         self.report_age_confirmation()
510                         disclaimer = urllib2.urlopen(request).read()
511                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
512                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
513                         return
514
515         def _real_extract(self, url):
516                 # Extract id and simplified title from URL
517                 mobj = re.match(self._VALID_URL, url)
518                 if mobj is None:
519                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
520                         return
521
522                 video_id = mobj.group(1)
523
524                 # Check if video comes from YouTube
525                 mobj2 = re.match(r'^yt-(.*)$', video_id)
526                 if mobj2 is not None:
527                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
528                         return
529
530                 # Retrieve video webpage to extract further information
531                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
532                 try:
533                         self.report_download_webpage(video_id)
534                         webpage = urllib2.urlopen(request).read()
535                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
536                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
537                         return
538
539                 # Extract URL, uploader and title from webpage
540                 self.report_extraction(video_id)
541                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
542                 if mobj is not None:
543                         mediaURL = urllib.unquote(mobj.group(1))
544                         video_extension = mediaURL[-3:]
545
546                         # Extract gdaKey if available
547                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
548                         if mobj is None:
549                                 video_url = mediaURL
550                         else:
551                                 gdaKey = mobj.group(1)
552                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
553                 else:
554                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
555                         if mobj is None:
556                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
557                                 return
558                         vardict = parse_qs(mobj.group(1))
559                         if 'mediaData' not in vardict:
560                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
561                                 return
562                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
563                         if mobj is None:
564                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
565                                 return
566                         mediaURL = mobj.group(1).replace('\\/', '/')
567                         video_extension = mediaURL[-3:]
568                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
569
570                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
571                 if mobj is None:
572                         self._downloader.trouble(u'ERROR: unable to extract title')
573                         return
574                 video_title = mobj.group(1).decode('utf-8')
575
576                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
577                 if mobj is None:
578                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
579                         return
580                 video_uploader = mobj.group(1)
581
582                 return [{
583                         'id':           video_id.decode('utf-8'),
584                         'url':          video_url.decode('utf-8'),
585                         'uploader':     video_uploader.decode('utf-8'),
586                         'upload_date':  u'NA',
587                         'title':        video_title,
588                         'ext':          video_extension.decode('utf-8'),
589                         'format':       u'NA',
590                         'player_url':   None,
591                 }]
592
593
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# Make sure this is a Dailymotion video URL and pull the id out of it.
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = match.group(1)
		video_extension = 'flv'

		# Fetch the video page; the cookie turns the family filter off so
		# that age-restricted videos are served too.
		req = urllib2.Request(url)
		req.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			page = urllib2.urlopen(req).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# The media URL lives inside the URL-encoded "sequence" flashvar.
		self.report_extraction(video_id)
		match = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', page)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		seq = urllib.unquote(match.group(1))
		match = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', seq)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		video_url = urllib.unquote(match.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		# Title comes from the OpenGraph metadata.
		match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', page)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = unescapeHTML(match.group('title').decode('utf-8'))

		match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', page)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = match.group(1)

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
671
672
673 class GoogleIE(InfoExtractor):
674         """Information extractor for video.google.com."""
675
676         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
677         IE_NAME = u'video.google'
678
679         def __init__(self, downloader=None):
680                 InfoExtractor.__init__(self, downloader)
681
682         def report_download_webpage(self, video_id):
683                 """Report webpage download."""
684                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
685
686         def report_extraction(self, video_id):
687                 """Report information extraction."""
688                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
689
690         def _real_extract(self, url):
691                 # Extract id from URL
692                 mobj = re.match(self._VALID_URL, url)
693                 if mobj is None:
694                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
695                         return
696
697                 video_id = mobj.group(1)
698
699                 video_extension = 'mp4'
700
701                 # Retrieve video webpage to extract further information
702                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
703                 try:
704                         self.report_download_webpage(video_id)
705                         webpage = urllib2.urlopen(request).read()
706                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
707                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
708                         return
709
710                 # Extract URL, uploader, and title from webpage
711                 self.report_extraction(video_id)
712                 mobj = re.search(r"download_url:'([^']+)'", webpage)
713                 if mobj is None:
714                         video_extension = 'flv'
715                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
716                 if mobj is None:
717                         self._downloader.trouble(u'ERROR: unable to extract media URL')
718                         return
719                 mediaURL = urllib.unquote(mobj.group(1))
720                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
721                 mediaURL = mediaURL.replace('\\x26', '\x26')
722
723                 video_url = mediaURL
724
725                 mobj = re.search(r'<title>(.*)</title>', webpage)
726                 if mobj is None:
727                         self._downloader.trouble(u'ERROR: unable to extract title')
728                         return
729                 video_title = mobj.group(1).decode('utf-8')
730
731                 # Extract video description
732                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
733                 if mobj is None:
734                         self._downloader.trouble(u'ERROR: unable to extract video description')
735                         return
736                 video_description = mobj.group(1).decode('utf-8')
737                 if not video_description:
738                         video_description = 'No description available.'
739
740                 # Extract video thumbnail
741                 if self._downloader.params.get('forcethumbnail', False):
742                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
743                         try:
744                                 webpage = urllib2.urlopen(request).read()
745                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
746                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
747                                 return
748                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
749                         if mobj is None:
750                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
751                                 return
752                         video_thumbnail = mobj.group(1)
753                 else:   # we need something to pass to process_info
754                         video_thumbnail = ''
755
756                 return [{
757                         'id':           video_id.decode('utf-8'),
758                         'url':          video_url.decode('utf-8'),
759                         'uploader':     u'NA',
760                         'upload_date':  u'NA',
761                         'title':        video_title,
762                         'ext':          video_extension.decode('utf-8'),
763                         'format':       u'NA',
764                         'player_url':   None,
765                 }]
766
767
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		# The video id is the .flv file name captured from the URL itself.
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_id = match.group(1)
		video_extension = 'flv'

		# Download the page that embeds the player.
		try:
			self.report_download_webpage(video_id)
			page = urllib2.urlopen(urllib2.Request(url)).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# The direct media URL is advertised in the video_src <link> tag.
		self.report_extraction(video_id)
		match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', page)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		video_url = urllib.unquote(match.group(1))

		# Title and uploader both live in the <title> element.
		match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', page)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = match.group(1).decode('utf-8')
		video_uploader = match.group(2).decode('utf-8')

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'ext':		video_extension.decode('utf-8'),
			'format':	u'NA',
			'player_url':	None,
		}]
833
834
835 class YahooIE(InfoExtractor):
836         """Information extractor for video.yahoo.com."""
837
838         # _VALID_URL matches all Yahoo! Video URLs
839         # _VPAGE_URL matches only the extractable '/watch/' URLs
840         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
841         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
842         IE_NAME = u'video.yahoo'
843
844         def __init__(self, downloader=None):
845                 InfoExtractor.__init__(self, downloader)
846
847         def report_download_webpage(self, video_id):
848                 """Report webpage download."""
849                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
850
851         def report_extraction(self, video_id):
852                 """Report information extraction."""
853                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
854
855         def _real_extract(self, url, new_video=True):
856                 # Extract ID from URL
857                 mobj = re.match(self._VALID_URL, url)
858                 if mobj is None:
859                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
860                         return
861
862                 video_id = mobj.group(2)
863                 video_extension = 'flv'
864
865                 # Rewrite valid but non-extractable URLs as
866                 # extractable English language /watch/ URLs
867                 if re.match(self._VPAGE_URL, url) is None:
868                         request = urllib2.Request(url)
869                         try:
870                                 webpage = urllib2.urlopen(request).read()
871                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
872                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
873                                 return
874
875                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
876                         if mobj is None:
877                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
878                                 return
879                         yahoo_id = mobj.group(1)
880
881                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
882                         if mobj is None:
883                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
884                                 return
885                         yahoo_vid = mobj.group(1)
886
887                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
888                         return self._real_extract(url, new_video=False)
889
890                 # Retrieve video webpage to extract further information
891                 request = urllib2.Request(url)
892                 try:
893                         self.report_download_webpage(video_id)
894                         webpage = urllib2.urlopen(request).read()
895                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
896                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
897                         return
898
899                 # Extract uploader and title from webpage
900                 self.report_extraction(video_id)
901                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
902                 if mobj is None:
903                         self._downloader.trouble(u'ERROR: unable to extract video title')
904                         return
905                 video_title = mobj.group(1).decode('utf-8')
906
907                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
908                 if mobj is None:
909                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
910                         return
911                 video_uploader = mobj.group(1).decode('utf-8')
912
913                 # Extract video thumbnail
914                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
915                 if mobj is None:
916                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
917                         return
918                 video_thumbnail = mobj.group(1).decode('utf-8')
919
920                 # Extract video description
921                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
922                 if mobj is None:
923                         self._downloader.trouble(u'ERROR: unable to extract video description')
924                         return
925                 video_description = mobj.group(1).decode('utf-8')
926                 if not video_description:
927                         video_description = 'No description available.'
928
929                 # Extract video height and width
930                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
931                 if mobj is None:
932                         self._downloader.trouble(u'ERROR: unable to extract video height')
933                         return
934                 yv_video_height = mobj.group(1)
935
936                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
937                 if mobj is None:
938                         self._downloader.trouble(u'ERROR: unable to extract video width')
939                         return
940                 yv_video_width = mobj.group(1)
941
942                 # Retrieve video playlist to extract media URL
943                 # I'm not completely sure what all these options are, but we
944                 # seem to need most of them, otherwise the server sends a 401.
945                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
946                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
947                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
948                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
949                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
950                 try:
951                         self.report_download_webpage(video_id)
952                         webpage = urllib2.urlopen(request).read()
953                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
954                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
955                         return
956
957                 # Extract media URL from playlist XML
958                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
959                 if mobj is None:
960                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
961                         return
962                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
963                 video_url = unescapeHTML(video_url)
964
965                 return [{
966                         'id':           video_id.decode('utf-8'),
967                         'url':          video_url,
968                         'uploader':     video_uploader,
969                         'upload_date':  u'NA',
970                         'title':        video_title,
971                         'ext':          video_extension.decode('utf-8'),
972                         'thumbnail':    video_thumbnail.decode('utf-8'),
973                         'description':  video_description,
974                         'thumbnail':    video_thumbnail,
975                         'player_url':   None,
976                 }]
977
978
979 class VimeoIE(InfoExtractor):
980         """Information extractor for vimeo.com."""
981
982         # _VALID_URL matches Vimeo URLs
983         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
984         IE_NAME = u'vimeo'
985
986         def __init__(self, downloader=None):
987                 InfoExtractor.__init__(self, downloader)
988
989         def report_download_webpage(self, video_id):
990                 """Report webpage download."""
991                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
992
993         def report_extraction(self, video_id):
994                 """Report information extraction."""
995                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
996
997         def _real_extract(self, url, new_video=True):
998                 # Extract ID from URL
999                 mobj = re.match(self._VALID_URL, url)
1000                 if mobj is None:
1001                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1002                         return
1003
1004                 video_id = mobj.group(1)
1005
1006                 # Retrieve video webpage to extract further information
1007                 request = urllib2.Request(url, None, std_headers)
1008                 try:
1009                         self.report_download_webpage(video_id)
1010                         webpage = urllib2.urlopen(request).read()
1011                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1012                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1013                         return
1014
1015                 # Now we begin extracting as much information as we can from what we
1016                 # retrieved. First we extract the information common to all extractors,
1017                 # and latter we extract those that are Vimeo specific.
1018                 self.report_extraction(video_id)
1019
1020                 # Extract the config JSON
1021                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1022                 try:
1023                         config = json.loads(config)
1024                 except:
1025                         self._downloader.trouble(u'ERROR: unable to extract info section')
1026                         return
1027                 
1028                 # Extract title
1029                 video_title = config["video"]["title"]
1030
1031                 # Extract uploader
1032                 video_uploader = config["video"]["owner"]["name"]
1033
1034                 # Extract video thumbnail
1035                 video_thumbnail = config["video"]["thumbnail"]
1036
1037                 # Extract video description
1038                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1039                 if video_description: video_description = clean_html(video_description)
1040                 else: video_description = ''
1041
1042                 # Extract upload date
1043                 video_upload_date = u'NA'
1044                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1045                 if mobj is not None:
1046                         video_upload_date = mobj.group(1)
1047
1048                 # Vimeo specific: extract request signature and timestamp
1049                 sig = config['request']['signature']
1050                 timestamp = config['request']['timestamp']
1051
1052                 # Vimeo specific: extract video codec and quality information
1053                 # TODO bind to format param
1054                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1055                 for codec in codecs:
1056                         if codec[0] in config["video"]["files"]:
1057                                 video_codec = codec[0]
1058                                 video_extension = codec[1]
1059                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1060                                 else: quality = 'sd'
1061                                 break
1062                 else:
1063                         self._downloader.trouble(u'ERROR: no known codec found')
1064                         return
1065
1066                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1067                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1068
1069                 return [{
1070                         'id':           video_id,
1071                         'url':          video_url,
1072                         'uploader':     video_uploader,
1073                         'upload_date':  video_upload_date,
1074                         'title':        video_title,
1075                         'ext':          video_extension,
1076                         'thumbnail':    video_thumbnail,
1077                         'description':  video_description,
1078                         'player_url':   None,
1079                 }]
1080
1081
1082 class GenericIE(InfoExtractor):
1083         """Generic last-resort information extractor."""
1084
1085         _VALID_URL = r'.*'
1086         IE_NAME = u'generic'
1087
1088         def __init__(self, downloader=None):
1089                 InfoExtractor.__init__(self, downloader)
1090
1091         def report_download_webpage(self, video_id):
1092                 """Report webpage download."""
1093                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1094                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1095
1096         def report_extraction(self, video_id):
1097                 """Report information extraction."""
1098                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1099
1100         def report_following_redirect(self, new_url):
1101                 """Report information extraction."""
1102                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1103                 
	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Resolves the URL with HEAD requests only (no bodies downloaded).
		Returns False when the URL does not redirect; otherwise hands the
		final URL back to the downloader and returns True so the caller can
		stop extracting.
		"""
		class HeadRequest(urllib2.Request):
			# A Request that issues HEAD instead of the default GET.
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl):
				if code in (301, 302, 303, 307):
					# Some servers emit unencoded spaces in Location headers.
					newurl = newurl.replace(' ', '%20')
					# Drop body-related headers: a HEAD request has no body.
					newheaders = dict((k,v) for k,v in req.headers.items()
									  if k.lower() not in ("content-length", "content-type"))
					return HeadRequest(newurl,
									   headers=newheaders,
									   origin_req_host=req.get_origin_req_host(),
									   unverifiable=True)
				else:
					# Any other status stops the redirect chain.
					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers):
				# Drain and close the 405 response before retrying with GET.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
								  if k.lower() not in ("content-length", "content-type"))
				return self.parent.open(urllib2.Request(req.get_full_url(),
												 headers=newheaders,
												 origin_req_host=req.get_origin_req_host(),
												 unverifiable=True))

		# Build our opener
		# (a bare OpenerDirector so only the handlers listed here are
		# installed; HTTPMethodFallback/HEADRedirectHandler replace the
		# default redirect behavior)
		opener = urllib2.OpenerDirector()
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
						HTTPMethodFallback, HEADRedirectHandler,
						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		# geturl() reflects the final URL after all redirects were followed.
		response = opener.open(HeadRequest(url))
		new_url = response.geturl()

		if url == new_url: return False

		# Restart the whole extraction chain on the resolved URL.
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True
1157
1158         def _real_extract(self, url):
1159                 if self._test_redirect(url): return
1160
1161                 video_id = url.split('/')[-1]
1162                 request = urllib2.Request(url)
1163                 try:
1164                         self.report_download_webpage(video_id)
1165                         webpage = urllib2.urlopen(request).read()
1166                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1167                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1168                         return
1169                 except ValueError, err:
1170                         # since this is the last-resort InfoExtractor, if
1171                         # this error is thrown, it'll be thrown here
1172                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1173                         return
1174
1175                 self.report_extraction(video_id)
1176                 # Start with something easy: JW Player in SWFObject
1177                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1178                 if mobj is None:
1179                         # Broaden the search a little bit
1180                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1181                 if mobj is None:
1182                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183                         return
1184
1185                 # It's possible that one of the regexes
1186                 # matched, but returned an empty group:
1187                 if mobj.group(1) is None:
1188                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1189                         return
1190
1191                 video_url = urllib.unquote(mobj.group(1))
1192                 video_id = os.path.basename(video_url)
1193
1194                 # here's a fun little line of code for you:
1195                 video_extension = os.path.splitext(video_id)[1][1:]
1196                 video_id = os.path.splitext(video_id)[0]
1197
1198                 # it's tempting to parse this further, but you would
1199                 # have to take into account all the variations like
1200                 #   Video Title - Site Name
1201                 #   Site Name | Video Title
1202                 #   Video Title - Tagline | Site Name
1203                 # and so on and so forth; it's just not practical
1204                 mobj = re.search(r'<title>(.*)</title>', webpage)
1205                 if mobj is None:
1206                         self._downloader.trouble(u'ERROR: unable to extract title')
1207                         return
1208                 video_title = mobj.group(1).decode('utf-8')
1209
1210                 # video uploader is domain name
1211                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1212                 if mobj is None:
1213                         self._downloader.trouble(u'ERROR: unable to extract title')
1214                         return
1215                 video_uploader = mobj.group(1).decode('utf-8')
1216
1217                 return [{
1218                         'id':           video_id.decode('utf-8'),
1219                         'url':          video_url.decode('utf-8'),
1220                         'uploader':     video_uploader,
1221                         'upload_date':  u'NA',
1222                         'title':        video_title,
1223                         'ext':          video_extension.decode('utf-8'),
1224                         'format':       u'NA',
1225                         'player_url':   None,
1226                 }]
1227
1228
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries (ytsearch[N|all]:...)."""
	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
	_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
	_max_youtube_results = 1000
	IE_NAME = u'youtube:search'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download search page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and queue the requested number of results."""
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		prefix = prefix[8:]
		query = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		if prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			return
		# NOTE: the try deliberately wraps the download call too, because in
		# Python 2 json.loads raises ValueError on malformed data and the
		# historical fallback is to fetch a single result.
		try:
			n = long(prefix)
			if n <= 0:
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				return
			if n > self._max_youtube_results:
				self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
				n = self._max_youtube_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		video_ids = []
		limit = n
		pagenum = 0

		while (50 * pagenum) < limit:
			self.report_download_page(query, pagenum + 1)
			result_url = self._API_URL % (urllib.quote_plus(query), (50 * pagenum) + 1)
			try:
				data = urllib2.urlopen(urllib2.Request(result_url)).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
				return
			api_response = json.loads(data)['data']

			video_ids.extend([entry['id'] for entry in api_response['items']])

			# The API reports the true total; never ask for more than exists.
			limit = min(n, api_response['totalItems'])
			pagenum += 1

		# Trim any overshoot from the last 50-entry page, then queue.
		for video_id in video_ids[:n]:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1303
1304
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries (gvsearch[N|all]:...)."""
	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
	_MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
	_max_google_results = 1000
	IE_NAME = u'video.google:search'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _queue_results(self, video_ids):
		"""Hand every collected docid over to the downloader."""
		for video_id in video_ids:
			self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

	def _real_extract(self, query):
		"""Parse the gvsearch prefix and queue the requested number of results."""
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		prefix = prefix[8:]
		query = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		if prefix == 'all':
			self._download_n_results(query, self._max_google_results)
			return
		try:
			n = long(prefix)
			if n <= 0:
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				return
			if n > self._max_google_results:
				self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
				n = self._max_google_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		video_ids = []
		pagenum = 0

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum * 10)
			try:
				page = urllib2.urlopen(urllib2.Request(result_url)).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers, skipping duplicates across pages.
			for match in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = match.group(1)
				if video_id in video_ids:
					continue
				video_ids.append(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					self._queue_results(video_ids)
					return

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				self._queue_results(video_ids)
				return

			pagenum += 1
1386
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries (yvsearch[N|all]:...)."""
	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	_max_yahoo_results = 1000
	IE_NAME = u'video.yahoo:search'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _queue_results(self, video_ids):
		"""Hand every collected watch id over to the downloader."""
		for video_id in video_ids:
			self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

	def _real_extract(self, query):
		"""Parse the yvsearch prefix and queue the requested number of results."""
		mobj = re.match(self._VALID_URL, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		prefix, query = query.split(':')
		prefix = prefix[8:]
		query = query.encode('utf-8')
		if prefix == '':
			self._download_n_results(query, 1)
			return
		if prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
			return
		try:
			n = long(prefix)
			if n <= 0:
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				return
			if n > self._max_yahoo_results:
				self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
				n = self._max_yahoo_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		video_ids = []
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			try:
				page = urllib2.urlopen(urllib2.Request(result_url)).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers; the set gives O(1) duplicate checks.
			for match in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = match.group(1)
				if video_id in already_seen:
					continue
				already_seen.add(video_id)
				video_ids.append(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					self._queue_results(video_ids)
					return

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				self._queue_results(video_ids)
				return

			pagenum += 1
1469
1470
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
	_MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
	IE_NAME = u'youtube:playlist'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_extract(self, url):
		"""Collect all video ids of the playlist and queue them for download."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		# Single video case
		if mobj.group(3) is not None:
			self._downloader.download([mobj.group(3)])
			return

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
		else:
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)

		video_ids = []
		pagenum = 1
		while True:
			self.report_download_page(playlist_id, pagenum)
			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
			try:
				page = urllib2.urlopen(urllib2.Request(url)).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract this page's video identifiers, skipping duplicates.
			ids_on_page = []
			for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
				vid = match.group(1)
				if vid not in ids_on_page:
					ids_on_page.append(vid)
			video_ids.extend(ids_on_page)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				break
			pagenum += 1

		# Honour the playliststart/playlistend download parameters.
		start = self._downloader.params.get('playliststart', 1) - 1
		end = self._downloader.params.get('playlistend', -1)
		video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

		for vid in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
1542
1543
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
	IE_NAME = u'youtube:user'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_extract(self, url):
		"""Collect all upload ids of the user via the GData API and queue them."""
		# Extract username
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			return

		username = mobj.group(1)

		# The Data API caps every response at _GDATA_PAGE_SIZE entries, so
		# keep paging until a response comes back short.
		video_ids = []
		pagenum = 0
		while True:
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers, keeping first occurrences only.
			ids_in_page = []
			for match in re.finditer(self._VIDEO_INDICATOR, page):
				if match.group(1) not in ids_in_page:
					ids_in_page.append(match.group(1))
			video_ids.extend(ids_in_page)

			# A page with fewer than _GDATA_PAGE_SIZE ids must be the last one.
			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
				break
			pagenum += 1

		all_ids_count = len(video_ids)

		# Honour the playliststart/playlistend download parameters.
		start = self._downloader.params.get('playliststart', 1) - 1
		end = self._downloader.params.get('playlistend', -1)
		video_ids = video_ids[start:] if end == -1 else video_ids[start:end]

		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1626
1627 class BlipTVUserIE(InfoExtractor):
1628         """Information Extractor for blip.tv users."""
1629
1630         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1631         _PAGE_SIZE = 12
1632         IE_NAME = u'blip.tv:user'
1633
1634         def __init__(self, downloader=None):
1635                 InfoExtractor.__init__(self, downloader)
1636
1637         def report_download_page(self, username, pagenum):
1638                 """Report attempt to download user page."""
1639                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1640                                 (self.IE_NAME, username, pagenum))
1641
1642         def _real_extract(self, url):
1643                 # Extract username
1644                 mobj = re.match(self._VALID_URL, url)
1645                 if mobj is None:
1646                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1647                         return
1648
1649                 username = mobj.group(1)
1650
1651                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1652
1653                 request = urllib2.Request(url)
1654
1655                 try:
1656                         page = urllib2.urlopen(request).read().decode('utf-8')
1657                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1658                         page_base = page_base % mobj.group(1)
1659                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1660                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1661                         return
1662
1663
1664                 # Download video ids using BlipTV Ajax calls. Result size per
1665                 # query is limited (currently to 12 videos) so we need to query
1666                 # page by page until there are no video ids - it means we got
1667                 # all of them.
1668
1669                 video_ids = []
1670                 pagenum = 1
1671
1672                 while True:
1673                         self.report_download_page(username, pagenum)
1674
1675                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1676
1677                         try:
1678                                 page = urllib2.urlopen(request).read().decode('utf-8')
1679                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1681                                 return
1682
1683                         # Extract video identifiers
1684                         ids_in_page = []
1685
1686                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1687                                 if mobj.group(1) not in ids_in_page:
1688                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1689
1690                         video_ids.extend(ids_in_page)
1691
1692                         # A little optimization - if current page is not
1693                         # "full", ie. does not contain PAGE_SIZE video ids then
1694                         # we can assume that this page is the last one - there
1695                         # are no more ids on further pages - no need to query
1696                         # again.
1697
1698                         if len(ids_in_page) < self._PAGE_SIZE:
1699                                 break
1700
1701                         pagenum += 1
1702
1703                 all_ids_count = len(video_ids)
1704                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1705                 playlistend = self._downloader.params.get('playlistend', -1)
1706
1707                 if playlistend == -1:
1708                         video_ids = video_ids[playliststart:]
1709                 else:
1710                         video_ids = video_ids[playliststart:playlistend]
1711
1712                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1713                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1714
1715                 for video_id in video_ids:
1716                         self._downloader.download([u'http://blip.tv/'+video_id])
1717
1718
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_extract(self, url):
		"""Fetch the file page (with the free-download form submitted) and extract the real URL."""
		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		free_download_indication = {'gateway_result': '1'}
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if mobj is None or mobj.group(1) is None:
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if mobj is not None and mobj.group(1) is not None:
				restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		return [{
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader': u'NA',
			'upload_date': u'NA',
			'title': file_title,
			'ext': file_extension.decode('utf-8'),
			'format': u'NA',
			'player_url': None,
		}]
1783
1784
1785 class FacebookIE(InfoExtractor):
1786         """Information Extractor for Facebook"""
1787
1788         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1789         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1790         _NETRC_MACHINE = 'facebook'
1791         _available_formats = ['video', 'highqual', 'lowqual']
1792         _video_extensions = {
1793                 'video': 'mp4',
1794                 'highqual': 'mp4',
1795                 'lowqual': 'mp4',
1796         }
1797         IE_NAME = u'facebook'
1798
1799         def __init__(self, downloader=None):
1800                 InfoExtractor.__init__(self, downloader)
1801
1802         def _reporter(self, message):
1803                 """Add header and report message."""
1804                 self._downloader.to_screen(u'[facebook] %s' % message)
1805
1806         def report_login(self):
1807                 """Report attempt to log in."""
1808                 self._reporter(u'Logging in')
1809
1810         def report_video_webpage_download(self, video_id):
1811                 """Report attempt to download video webpage."""
1812                 self._reporter(u'%s: Downloading video webpage' % video_id)
1813
1814         def report_information_extraction(self, video_id):
1815                 """Report attempt to extract video information."""
1816                 self._reporter(u'%s: Extracting video information' % video_id)
1817
1818         def _parse_page(self, video_webpage):
1819                 """Extract video information from page"""
1820                 # General data
1821                 data = {'title': r'\("video_title", "(.*?)"\)',
1822                         'description': r'<div class="datawrap">(.*?)</div>',
1823                         'owner': r'\("video_owner_name", "(.*?)"\)',
1824                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1825                         }
1826                 video_info = {}
1827                 for piece in data.keys():
1828                         mobj = re.search(data[piece], video_webpage)
1829                         if mobj is not None:
1830                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1831
1832                 # Video urls
1833                 video_urls = {}
1834                 for fmt in self._available_formats:
1835                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1836                         if mobj is not None:
1837                                 # URL is in a Javascript segment inside an escaped Unicode format within
1838                                 # the generally utf-8 page
1839                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1840                 video_info['video_urls'] = video_urls
1841
1842                 return video_info
1843
1844         def _real_initialize(self):
1845                 if self._downloader is None:
1846                         return
1847
1848                 useremail = None
1849                 password = None
1850                 downloader_params = self._downloader.params
1851
1852                 # Attempt to use provided username and password or .netrc data
1853                 if downloader_params.get('username', None) is not None:
1854                         useremail = downloader_params['username']
1855                         password = downloader_params['password']
1856                 elif downloader_params.get('usenetrc', False):
1857                         try:
1858                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1859                                 if info is not None:
1860                                         useremail = info[0]
1861                                         password = info[2]
1862                                 else:
1863                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1864                         except (IOError, netrc.NetrcParseError), err:
1865                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1866                                 return
1867
1868                 if useremail is None:
1869                         return
1870
1871                 # Log in
1872                 login_form = {
1873                         'email': useremail,
1874                         'pass': password,
1875                         'login': 'Log+In'
1876                         }
1877                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1878                 try:
1879                         self.report_login()
1880                         login_results = urllib2.urlopen(request).read()
1881                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1882                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1883                                 return
1884                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1885                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1886                         return
1887
1888         def _real_extract(self, url):
1889                 mobj = re.match(self._VALID_URL, url)
1890                 if mobj is None:
1891                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1892                         return
1893                 video_id = mobj.group('ID')
1894
1895                 # Get video webpage
1896                 self.report_video_webpage_download(video_id)
1897                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1898                 try:
1899                         page = urllib2.urlopen(request)
1900                         video_webpage = page.read()
1901                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1902                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1903                         return
1904
1905                 # Start extracting information
1906                 self.report_information_extraction(video_id)
1907
1908                 # Extract information
1909                 video_info = self._parse_page(video_webpage)
1910
1911                 # uploader
1912                 if 'owner' not in video_info:
1913                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1914                         return
1915                 video_uploader = video_info['owner']
1916
1917                 # title
1918                 if 'title' not in video_info:
1919                         self._downloader.trouble(u'ERROR: unable to extract video title')
1920                         return
1921                 video_title = video_info['title']
1922                 video_title = video_title.decode('utf-8')
1923
1924                 # thumbnail image
1925                 if 'thumbnail' not in video_info:
1926                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1927                         video_thumbnail = ''
1928                 else:
1929                         video_thumbnail = video_info['thumbnail']
1930
1931                 # upload date
1932                 upload_date = u'NA'
1933                 if 'upload_date' in video_info:
1934                         upload_time = video_info['upload_date']
1935                         timetuple = email.utils.parsedate_tz(upload_time)
1936                         if timetuple is not None:
1937                                 try:
1938                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1939                                 except:
1940                                         pass
1941
1942                 # description
1943                 video_description = video_info.get('description', 'No description available.')
1944
1945                 url_map = video_info['video_urls']
1946                 if len(url_map.keys()) > 0:
1947                         # Decide which formats to download
1948                         req_format = self._downloader.params.get('format', None)
1949                         format_limit = self._downloader.params.get('format_limit', None)
1950
1951                         if format_limit is not None and format_limit in self._available_formats:
1952                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1953                         else:
1954                                 format_list = self._available_formats
1955                         existing_formats = [x for x in format_list if x in url_map]
1956                         if len(existing_formats) == 0:
1957                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1958                                 return
1959                         if req_format is None:
1960                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1961                         elif req_format == 'worst':
1962                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1963                         elif req_format == '-1':
1964                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1965                         else:
1966                                 # Specific format
1967                                 if req_format not in url_map:
1968                                         self._downloader.trouble(u'ERROR: requested format not available')
1969                                         return
1970                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1971
1972                 results = []
1973                 for format_param, video_real_url in video_url_list:
1974                         # Extension
1975                         video_extension = self._video_extensions.get(format_param, 'mp4')
1976
1977                         results.append({
1978                                 'id':           video_id.decode('utf-8'),
1979                                 'url':          video_real_url.decode('utf-8'),
1980                                 'uploader':     video_uploader.decode('utf-8'),
1981                                 'upload_date':  upload_date,
1982                                 'title':        video_title,
1983                                 'ext':          video_extension.decode('utf-8'),
1984                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1985                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1986                                 'description':  video_description.decode('utf-8'),
1987                                 'player_url':   None,
1988                         })
1989                 return results
1990
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Extracts the filename extension from a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the server answered with the media itself (direct download)."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video info, either from a direct media response or from the JSON API."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Ask blip.tv for a JSON description by appending skin=json to the
		# original URL, using '&' if it already carries a query string.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url.encode('utf-8'))
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The server returned the media itself instead of JSON;
				# derive id/title/extension from the URL's basename.
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'ext': ext,
					# Keep the already-open handle so the media is not fetched twice.
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				# 'json' is provided by the star-import from utils.
				json_data = json.loads(json_code)
				# Some responses wrap the payload in a 'Post' object.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Convert the API's 'MM-DD-YY HH:MM(AM/PM)' datestamp to YYYYMMDD.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		# NOTE(review): mutates the global std_headers for the subsequent
		# media download — presumably blip.tv only serves some media to the
		# iTunes user agent; confirm before changing.
		std_headers['User-Agent'] = 'iTunes/10.6.1'
		return [info]
2077
2078
2079 class MyVideoIE(InfoExtractor):
2080         """Information Extractor for myvideo.de."""
2081
2082         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2083         IE_NAME = u'myvideo'
2084
2085         def __init__(self, downloader=None):
2086                 InfoExtractor.__init__(self, downloader)
2087         
2088         def report_download_webpage(self, video_id):
2089                 """Report webpage download."""
2090                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2091
2092         def report_extraction(self, video_id):
2093                 """Report information extraction."""
2094                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2095
2096         def _real_extract(self,url):
2097                 mobj = re.match(self._VALID_URL, url)
2098                 if mobj is None:
2099                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2100                         return
2101
2102                 video_id = mobj.group(1)
2103
2104                 # Get video webpage
2105                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2106                 try:
2107                         self.report_download_webpage(video_id)
2108                         webpage = urllib2.urlopen(request).read()
2109                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2110                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2111                         return
2112
2113                 self.report_extraction(video_id)
2114                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2115                                  webpage)
2116                 if mobj is None:
2117                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2118                         return
2119                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2120
2121                 mobj = re.search('<title>([^<]+)</title>', webpage)
2122                 if mobj is None:
2123                         self._downloader.trouble(u'ERROR: unable to extract title')
2124                         return
2125
2126                 video_title = mobj.group(1)
2127
2128                 return [{
2129                         'id':           video_id,
2130                         'url':          video_url,
2131                         'uploader':     u'NA',
2132                         'upload_date':  u'NA',
2133                         'title':        video_title,
2134                         'ext':          u'flv',
2135                         'format':       u'NA',
2136                         'player_url':   None,
2137                 }]
2138
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a shortname form (":tds", ":colbertnation", ...) that is
	# resolved to the newest full episode, or a direct full-episodes URL.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		"""Report per-item configuration download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		"""Report show index (RSS) download."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		"""Report resolution of the Flash player URL."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve the episode page, fetch its media index and return one info dict per item."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map a shortname to the show's full-episodes index page and re-match.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# No explicit episode given: the index page redirects to the newest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Follow the redirect to discover which episode we actually got.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player URL embeds an mtvnservices URI identifying the episode.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirects once; it is reused for
		# every item below.
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		results = []

		# The MRSS index lists one <item> per video segment of the episode.
		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Each item needs its own mediaGen config to learn the media URLs.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			# NOTE(review): turls is kept in document order; taking the last
			# entry assumes renditions are listed lowest-to-highest bitrate —
			# confirm against the mediaGen feed.
			format,video_url = turls[-1]

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			results.append(info)

		return results
2270
2271
2272 class EscapistIE(InfoExtractor):
2273         """Information extractor for The Escapist """
2274
2275         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2276         IE_NAME = u'escapist'
2277
2278         def report_extraction(self, showName):
2279                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2280
2281         def report_config_download(self, showName):
2282                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2283
2284         def _real_extract(self, url):
2285                 mobj = re.match(self._VALID_URL, url)
2286                 if mobj is None:
2287                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2288                         return
2289                 showName = mobj.group('showname')
2290                 videoId = mobj.group('episode')
2291
2292                 self.report_extraction(showName)
2293                 try:
2294                         webPage = urllib2.urlopen(url)
2295                         webPageBytes = webPage.read()
2296                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2297                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2298                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2299                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2300                         return
2301
2302                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2303                 description = unescapeHTML(descMatch.group(1))
2304                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2305                 imgUrl = unescapeHTML(imgMatch.group(1))
2306                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2307                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2308                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2309                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2310
2311                 self.report_config_download(showName)
2312                 try:
2313                         configJSON = urllib2.urlopen(configUrl).read()
2314                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2315                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2316                         return
2317
2318                 # Technically, it's JavaScript, not JSON
2319                 configJSON = configJSON.replace("'", '"')
2320
2321                 try:
2322                         config = json.loads(configJSON)
2323                 except (ValueError,), err:
2324                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2325                         return
2326
2327                 playlist = config['playlist']
2328                 videoUrl = playlist[1]['url']
2329
2330                 info = {
2331                         'id': videoId,
2332                         'url': videoUrl,
2333                         'uploader': showName,
2334                         'upload_date': None,
2335                         'title': showName,
2336                         'ext': 'flv',
2337                         'format': 'flv',
2338                         'thumbnail': imgUrl,
2339                         'description': description,
2340                         'player_url': playerUrl,
2341                 }
2342
2343                 return [info]
2344
2345
class CollegeHumorIE(InfoExtractor):
	"""Information extractor for collegehumor.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
	IE_NAME = u'collegehumor'

	def report_webpage(self, video_id):
		"""Announce that the video page is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve a collegehumor video page to its media URL and metadata."""
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = match.group('videoid')

		self.report_webpage(video_id)
		try:
			webpage = urllib2.urlopen(urllib2.Request(url)).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# The page embeds a second, internal id under which the
		# metadata service knows this video.
		id_match = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
		if id_match is None:
			self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
			return
		internal_video_id = id_match.group('internalvideoid')

		info = {
			'id': video_id,
			'internal_id': internal_video_id,
		}

		self.report_extraction(video_id)
		xml_url = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
		try:
			metadata_xml = urllib2.urlopen(xml_url).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metadata_xml)
		try:
			# findall(...)[0] intentionally raises IndexError on a missing
			# node, which the except clause below turns into a user error.
			video_node = mdoc.findall('./video')[0]
			info['description'] = video_node.findall('./description')[0].text
			info['title'] = video_node.findall('./caption')[0].text
			info['url'] = video_node.findall('./file')[0].text
			info['thumbnail'] = video_node.findall('./thumbnail')[0].text
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
		except IndexError:
			self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
			return

		return [info]
2408
2409
class XVideosIE(InfoExtractor):
	"""Information extractor for xvideos.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
	IE_NAME = u'xvideos'

	def report_webpage(self, video_id):
		"""Announce that the video page is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Pull the flash media URL, title and thumbnail out of the page."""
		match = re.match(self._VALID_URL, url)
		if match is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = match.group(1).decode('utf-8')

		self.report_webpage(video_id)

		try:
			page_request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
			webpage = urllib2.urlopen(page_request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		self.report_extraction(video_id)

		# Extract video URL (percent-encoded inside the flash vars)
		match = re.search(r'flv_url=(.+?)&', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = urllib2.unquote(match.group(1).decode('utf-8'))

		# Extract title (everything before the site suffix)
		match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = match.group(1).decode('utf-8')

		# Extract video thumbnail (the whole matched URL, group(0))
		match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
		if match is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = match.group(0).decode('utf-8')

		return [{
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': 'flv',
			'format': 'flv',
			'thumbnail': video_thumbnail,
			'description': None,
			'player_url': None,
		}]
2480
2481
class SoundcloudIE(InfoExtractor):
	"""Information extractor for soundcloud.com
	   To access the media, the uid of the song and a stream token
	   must be extracted from the page source. The media can then be
	   grabbed by requesting from an url composed of the stream token
	   and uid
	 """

	_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'soundcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract track metadata and the tokenized stream URL from a track page."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# extract uploader (which is in the url)
		uploader = mobj.group(1).decode('utf-8')
		# extract simple title (uploader + slug of song title)
		slug_title = mobj.group(2).decode('utf-8')
		simple_title = uploader + u'-' + slug_title

		self.report_webpage('%s/%s' % (uploader, slug_title))

		request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		self.report_extraction('%s/%s' % (uploader, slug_title))

		# extract uid and stream token that soundcloud hands out for access
		mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
		if mobj is None:
			# Bug fix: previously a failed match left video_id/stream_token
			# unbound and the code below crashed with NameError.
			self._downloader.trouble(u'ERROR: unable to extract uid and stream token')
			return
		video_id = mobj.group(1)
		stream_token = mobj.group(2)

		# extract unsimplified title; fall back to the slug-based one
		mobj = re.search('"title":"(.*?)",', webpage)
		if mobj:
			title = mobj.group(1).decode('utf-8')
		else:
			title = simple_title

		# construct media url (with uid/token)
		mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
		mediaURL = mediaURL % (video_id, stream_token)

		# description (optional; keep a readable default)
		description = u'No description available'
		mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
		if mobj:
			description = mobj.group(1)

		# upload date; parse failures are logged but not fatal since the
		# page's date format changes over time
		upload_date = None
		mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
		if mobj:
			try:
				upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
			except Exception as e:
				self._downloader.to_stderr(str(e))

		# NOTE: an unsent Request to media.soundcloud.com/crossdomain.xml
		# (with std_headers wrongly passed as the *data* argument) used to
		# be constructed here; it was never opened, so it has been removed.

		return [{
			'id':		video_id.decode('utf-8'),
			'url':		mediaURL,
			'uploader':	uploader.decode('utf-8'),
			'upload_date':	upload_date,
			'title':	title,
			'ext':		u'mp3',
			'format':	u'NA',
			'player_url':	None,
			'description': description.decode('utf-8')
		}]
2574
2575
class InfoQIE(InfoExtractor):
	"""Information extractor for infoq.com"""

	_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
	IE_NAME = u'infoq'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Extract the RTMP media URL, title and description of an InfoQ talk."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		self.report_webpage(url)

		request = urllib2.Request(url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		self.report_extraction(url)

		# Extract video URL: the media path is stored base64-encoded in
		# the page's jsclassref attribute.
		mobj = re.search(r"jsclassref='([^']*)'", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

		# Extract title
		mobj = re.search(r'contentTitle = "(.*?)";', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = mobj.group(1).decode('utf-8')

		# Extract description (optional)
		video_description = u'No description available.'
		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')

		video_filename = video_url.split('/')[-1]
		# Bug fix: the filename may itself contain dots, in which case a
		# plain split('.') raised ValueError (too many values to unpack).
		# Only the last dot separates the extension.
		video_id, extension = video_filename.rsplit('.', 1)

		info = {
			'id': video_id,
			'url': video_url,
			'uploader': None,
			'upload_date': None,
			'title': video_title,
			'ext': extension,
			'format': extension, # Extension is always(?) mp4, but seems to be flv
			'thumbnail': None,
			'description': video_description,
			'player_url': None,
		}

		return [info]
2646
class MixcloudIE(InfoExtractor):
	"""Information extractor for www.mixcloud.com"""
	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
	IE_NAME = u'mixcloud'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_json(self, file_id):
		"""Report JSON download."""
		self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def get_urls(self, jsonData, fmt, bitrate='best'):
		"""Get urls from 'audio_formats' section in json.

		Returns the url list for the given format and bitrate; 'best'
		(or an unknown bitrate) selects the highest available one.
		"""
		try:
			bitrate_list = jsonData[fmt]
			if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
				bitrate = max(bitrate_list) # select highest

			url_list = jsonData[fmt][bitrate]
		except TypeError: # we have no bitrate info.
			url_list = jsonData[fmt]
		return url_list

	def check_urls(self, url_list):
		"""Returns 1st active url from list, or None if none respond."""
		for url in url_list:
			try:
				urllib2.urlopen(url)
				return url
			except (urllib2.URLError, httplib.HTTPException, socket.error):
				continue

		return None

	def _print_formats(self, formats):
		"""Print one format/bitrate/extension line per available variant."""
		print('Available formats:')
		for fmt in formats.keys():
			for b in formats[fmt]:
				try:
					ext = formats[fmt][b][0]
					print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
				except TypeError: # we have no bitrate info
					ext = formats[fmt][0]
					print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
					break

	def _real_extract(self, url):
		"""Query the Mixcloud API and pick a working stream URL."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# extract uploader & filename from url
		uploader = mobj.group(1).decode('utf-8')
		file_id = uploader + "-" + mobj.group(2).decode('utf-8')

		# construct API request; keep it in its own variable so it can
		# never be confused with the media file_url selected below
		api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
		# retrieve .json file with links to files
		request = urllib2.Request(api_url)
		try:
			self.report_download_json(api_url)
			jsonData = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
			return

		# parse JSON
		json_data = json.loads(jsonData)
		player_url = json_data['player_swf_url']
		formats = dict(json_data['audio_formats'])

		req_format = self._downloader.params.get('format', None)

		if self._downloader.params.get('listformats', None):
			self._print_formats(formats)
			return

		file_url = None
		format_param = None
		if req_format is None or req_format == 'best':
			for format_param in formats.keys():
				url_list = self.get_urls(formats, format_param)
				# check urls
				file_url = self.check_urls(url_list)
				if file_url is not None:
					break # got it!
		else:
			if req_format not in formats.keys():
				self._downloader.trouble(u'ERROR: format is not available')
				return

			url_list = self.get_urls(formats, req_format)
			file_url = self.check_urls(url_list)
			format_param = req_format

		if file_url is None:
			# Bug fix: previously a None file_url fell through to
			# file_url.decode() below and crashed with AttributeError.
			self._downloader.trouble(u'ERROR: unable to find a working stream URL')
			return

		return [{
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'uploader':	uploader.decode('utf-8'),
			'upload_date': u'NA',
			'title': json_data['name'],
			'ext': file_url.split('.')[-1].decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': json_data['thumbnail_url'],
			'description': json_data['description'],
			'player_url': player_url.decode('utf-8'),
		}]
2759
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom

	Handles three URL shapes: a specific video page (course + video),
	a course page (returned as the extraction results of every video
	it links to), and the site root (returned as the results of every
	course it links to).
	"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report that the page for objid is being downloaded."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Dispatch on URL shape: single video, course playlist, or site root."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': course + '_' + video,
			}

			self.report_extraction(info['id'])
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				# Title and the media path (relative to baseUrl) come
				# from the per-video XML descriptor.
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			return [info]
		elif mobj.group('course'): # A course page
			course = mobj.group('course')
			info = {
				'id': course,
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				# Fall back to the course id when the page has no heading.
				info['title'] = info['id']

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Every linked VideoPage is re-dispatched through self.extract(),
			# which ends up in the single-video branch above.
			links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]
			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results

		else: # Root page
			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']

			# Every linked CoursePage is re-dispatched through self.extract(),
			# which recurses into the course branch above.
			links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			results = []
			for entry in info['list']:
				assert entry['type'] == 'reference'
				results += self.extract(entry['url'])
			return results
2871
class MTVIE(InfoExtractor):
	"""Information extractor for MTV.com"""

	_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
	IE_NAME = u'mtv'

	def report_webpage(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		"""Resolve an MTV video page to its highest-quality rendition."""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		if not mobj.group('proto'):
			# The scheme is optional in _VALID_URL; normalize for urllib2.
			url = 'http://' + url
		video_id = mobj.group('videoid')
		self.report_webpage(video_id)

		request = urllib2.Request(url)
		try:
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract song name')
			return
		song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract performer')
			return
		performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
		video_title = performer + ' - ' + song_name

		mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
		if mobj is None:
			# Bug fix: message previously read 'unable to mtvn_uri'.
			self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
			return
		mtvn_uri = mobj.group(1)

		mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract content id')
			return
		content_id = mobj.group(1)

		videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
		self.report_extraction(video_id)
		request = urllib2.Request(videogen_url)
		try:
			metadataXml = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
			return

		mdoc = xml.etree.ElementTree.fromstring(metadataXml)
		renditions = mdoc.findall('.//rendition')
		if not renditions:
			# Bug fix: renditions[-1] below raised IndexError on an empty list.
			self._downloader.trouble(u'ERROR: unable to extract media renditions')
			return

		# For now, always pick the highest quality.
		rendition = renditions[-1]

		try:
			_,_,ext = rendition.attrib['type'].partition('/')
			format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
			# AttributeError covers rendition.find('./src') returning None,
			# which the original except KeyError silently missed.
			video_url = rendition.find('./src').text
		except (KeyError, AttributeError):
			self._downloader.trouble('Invalid rendition field.')
			return

		info = {
			'id': video_id,
			'url': video_url,
			'uploader': performer,
			'title': video_title,
			'ext': ext,
			'format': format,
		}

		return [info]
2960
2961
2962 class YoukuIE(InfoExtractor):
2963
2964         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2965         IE_NAME = u'Youku'
2966
	def __init__(self, downloader=None):
		# Delegate to the base class, which stores the downloader used
		# for all progress and error reporting.
		InfoExtractor.__init__(self, downloader)
2969
2970         def report_download_webpage(self, file_id):
2971                 """Report webpage download."""
2972                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2973
2974         def report_extraction(self, file_id):
2975                 """Report information extraction."""
2976                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
2977
2978         def _gen_sid(self):
2979                 nowTime = int(time.time() * 1000)
2980                 random1 = random.randint(1000,1998)
2981                 random2 = random.randint(1000,9999)
2982
2983                 return "%d%d%d" %(nowTime,random1,random2)
2984
2985         def _get_file_ID_mix_string(self, seed):
2986                 mixed = []
2987                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2988                 seed = float(seed)
2989                 for i in range(len(source)):
2990                         seed  =  (seed * 211 + 30031 ) % 65536
2991                         index  =  math.floor(seed / 65536 * len(source) )
2992                         mixed.append(source[int(index)])
2993                         source.remove(source[int(index)])
2994                 #return ''.join(mixed)
2995                 return mixed
2996
2997         def _get_file_id(self, fileId, seed):
2998                 mixed = self._get_file_ID_mix_string(seed)
2999                 ids = fileId.split('*')
3000                 realId = []
3001                 for ch in ids:
3002                         if ch:
3003                                 realId.append(mixed[int(ch)])
3004                 return ''.join(realId)
3005
3006         def _real_extract(self, url):
3007                 mobj = re.match(self._VALID_URL, url)
3008                 if mobj is None:
3009                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3010                         return
3011                 video_id = mobj.group('ID')
3012
3013                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3014
3015                 request = urllib2.Request(info_url, None, std_headers)
3016                 try:
3017                         self.report_download_webpage(video_id)
3018                         jsondata = urllib2.urlopen(request).read()
3019                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3020                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3021                         return
3022
3023                 self.report_extraction(video_id)
3024                 try:
3025                         config = json.loads(jsondata)
3026
3027                         video_title =  config['data'][0]['title']
3028                         seed = config['data'][0]['seed']
3029
3030                         format = self._downloader.params.get('format', None)
3031                         supported_format = config['data'][0]['streamfileids'].keys()
3032
3033                         if format is None or format == 'best':
3034                                 if 'hd2' in supported_format:
3035                                         format = 'hd2'
3036                                 else:
3037                                         format = 'flv'
3038                                 ext = u'flv'
3039                         elif format == 'worst':
3040                                 format = 'mp4'
3041                                 ext = u'mp4'
3042                         else:
3043                                 format = 'flv'
3044                                 ext = u'flv'
3045
3046
3047                         fileid = config['data'][0]['streamfileids'][format]
3048                         seg_number = len(config['data'][0]['segs'][format])
3049
3050                         keys=[]
3051                         for i in xrange(seg_number):
3052                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3053
3054                         #TODO check error
3055                         #youku only could be viewed from mainland china
3056                 except:
3057                         self._downloader.trouble(u'ERROR: unable to extract info section')
3058                         return
3059
3060                 files_info=[]
3061                 sid = self._gen_sid()
3062                 fileid = self._get_file_id(fileid, seed)
3063
3064                 #column 8,9 of fileid represent the segment number
3065                 #fileid[7:9] should be changed
3066                 for index, key in enumerate(keys):
3067
3068                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3069                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3070
3071                         info = {
3072                                 'id': '%s_part%02d' % (video_id, index),
3073                                 'url': download_url,
3074                                 'uploader': None,
3075                                 'title': video_title,
3076                                 'ext': ext,
3077                                 'format': u'NA'
3078                         }
3079                         files_info.append(info)
3080
3081                 return files_info
3082
3083
class XNXXIE(InfoExtractor):
	"""Information extractor for xnxx.com"""

	_VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
	IE_NAME = u'xnxx'
	# FIX: the dot in "XNXX.COM" was unescaped and matched any character.
	VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
	VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX\.COM'
	VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

	def report_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(1).decode('utf-8')

		self.report_webpage(video_id)

		# Get webpage content
		try:
			webpage = urllib2.urlopen(url).read()
		# FIX: use "as err" instead of the deprecated comma syntax, matching
		# the exception-handling style used elsewhere in this file.
		except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
			return

		# The flv URL is percent-encoded inside the page's flashvars.
		result = re.search(self.VIDEO_URL_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video url')
			return
		video_url = urllib.unquote(result.group(1).decode('utf-8'))

		result = re.search(self.VIDEO_TITLE_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = result.group(1).decode('utf-8')

		result = re.search(self.VIDEO_THUMB_RE, webpage)
		if result is None:
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			return
		video_thumbnail = result.group(1).decode('utf-8')

		info = {'id': video_id,
				'url': video_url,
				'uploader': None,
				'upload_date': None,
				'title': video_title,
				'ext': 'flv',
				'format': 'flv',
				'thumbnail': video_thumbnail,
				'description': None,
				'player_url': None}

		return [info]